# Related Works
# RL
@article{huh2023multi,
  author  = {Huh, Dom and Mohapatra, Prasant},
  title   = {Multi-agent reinforcement learning: A comprehensive survey},
  journal = {arXiv preprint arXiv:2312.10256},
  year    = {2023},
}

@article{ghasemi2024comprehensive,
  author  = {Ghasemi, Majid and Moosavi, Amir Hossein and Ebrahimi, Dariush},
  title   = {A comprehensive survey of reinforcement learning: From algorithms to practical challenges},
  journal = {arXiv preprint arXiv:2411.18892},
  year    = {2024},
}

@article{ai2025inquiremobile,
  author  = {Ai, Qihang and Bu, Pi and Cao, Yue and Wang, Yingyao and Gu, Jihao and Xing, Jingxuan and Zhu, Zekun and Jiang, Wei and Zheng, Zhicheng and Song, Jun and others},
  title   = {{InquireMobile}: Teaching {VLM}-based Mobile Agent to Request Human Assistance via Reinforcement Fine-Tuning},
  journal = {arXiv preprint arXiv:2508.19679},
  year    = {2025},
}

@article{xia2025visionary,
  author  = {Xia, Jiaer and Zang, Yuhang and Gao, Peng and Li, Yixuan and Zhou, Kaiyang},
  title   = {{Visionary-R1}: Mitigating Shortcuts in Visual Reasoning with Reinforcement Learning},
  journal = {arXiv preprint arXiv:2505.14677},
  year    = {2025},
}

@article{wang2025vision,
  author  = {Wang, Qinsi and Liu, Bo and Zhou, Tianyi and Shi, Jing and Lin, Yueqian and Chen, Yiran and Li, Hai Helen and Wan, Kun and Zhao, Wentian},
  title   = {{Vision-Zero}: Scalable {VLM} Self-Improvement via Strategic Gamified Self-Play},
  journal = {arXiv preprint arXiv:2509.25541},
  year    = {2025},
}

@article{long2025adsqa,
  author  = {Long, Xinwei and Tian, Kai and Xu, Peng and Jia, Guoli and Li, Jingxuan and Yang, Sa and Shao, Yihua and Zhang, Kaiyan and Jiang, Che and Xu, Hao and others},
  title   = {{AdsQA}: Towards Advertisement Video Understanding},
  journal = {arXiv preprint arXiv:2509.08621},
  year    = {2025},
}


@article{kan2025taco,
  author  = {Kan, Zhehan and Liu, Yanlin and Yin, Kun and Jiang, Xinghua and Li, Xin and Cao, Haoyu and Liu, Yinsong and Jiang, Deqiang and Sun, Xing and Liao, Qingmin and others},
  title   = {{TACO}: Think-Answer Consistency for Optimized Long-Chain Reasoning and Efficient Data Learning via Reinforcement Learning in {LVLMs}},
  journal = {arXiv preprint arXiv:2505.20777},
  year    = {2025},
}


@article{fan2025grit,
  author  = {Fan, Yue and He, Xuehai and Yang, Diji and Zheng, Kaizhi and Kuo, Ching-Chen and Zheng, Yuting and Narayanaraju, Sravana Jyothi and Guan, Xinze and Wang, Xin Eric},
  title   = {{GRIT}: Teaching {MLLMs} to Think with Images},
  journal = {arXiv preprint arXiv:2505.15879},
  year    = {2025},
}

@article{huang2025vision,
  author  = {Huang, Wenxuan and Jia, Bohan and Zhai, Zijie and Cao, Shaosheng and Ye, Zheyu and Zhao, Fei and Xu, Zhe and Hu, Yao and Lin, Shaohui},
  title   = {{Vision-R1}: Incentivizing Reasoning Capability in Multimodal Large Language Models},
  journal = {arXiv preprint arXiv:2503.06749},
  year    = {2025},
}

@article{shen2025vlm,
  author  = {Shen, Haozhan and Liu, Peng and Li, Jingcheng and Fang, Chunxin and Ma, Yibo and Liao, Jiajia and Shen, Qiaoli and Zhang, Zilun and Zhao, Kangjia and Zhang, Qianqian and others},
  title   = {{VLM-R1}: A Stable and Generalizable {R1}-style Large Vision-Language Model},
  journal = {arXiv preprint arXiv:2504.07615},
  year    = {2025},
}

@article{cao2025ground,
  author  = {Cao, Meng and Zhao, Haoze and Zhang, Can and Chang, Xiaojun and Reid, Ian and Liang, Xiaodan},
  title   = {{Ground-R1}: Incentivizing Grounded Visual Reasoning via Reinforcement Learning},
  journal = {arXiv preprint arXiv:2505.20272},
  year    = {2025},
}

@article{chu2025qwen,
  author  = {Chu, Xu and Chen, Xinrong and Wang, Guanyu and Tan, Zhijie and Huang, Kui and Lv, Wenyu and Mo, Tong and Li, Weiping},
  title   = {{Qwen} Look Again: Guiding Vision-Language Reasoning Models to Re-attention Visual Information},
  journal = {arXiv preprint arXiv:2505.23558},
  year    = {2025},
}

@article{xu2025visual,
  author  = {Xu, Yi and Li, Chengzu and Zhou, Han and Wan, Xingchen and Zhang, Caiqi and Korhonen, Anna and Vuli{\'c}, Ivan},
  title   = {Visual Planning: Let's Think Only with Images},
  journal = {arXiv preprint arXiv:2505.11409},
  year    = {2025},
}


@article{zhang2024survey,
  author  = {Zhang, Ruize and Xu, Zelai and Ma, Chengdong and Yu, Chao and Tu, Wei-Wei and Tang, Wenhao and Huang, Shiyu and Ye, Deheng and Ding, Wenbo and Yang, Yaodong and others},
  title   = {A survey on self-play methods in reinforcement learning},
  journal = {arXiv preprint arXiv:2408.01072},
  year    = {2024},
}

@article{wu2025reinforcement,
  author  = {Wu, Weijia and Gao, Chen and Chen, Joya and Lin, Kevin Qinghong and Meng, Qingwei and Zhang, Yiming and Qiu, Yuke and Zhou, Hong and Shou, Mike Zheng},
  title   = {Reinforcement Learning in Vision: A Survey},
  journal = {arXiv preprint arXiv:2508.08189},
  year    = {2025},
}


@article{tan2025chartmaster,
  author  = {Tan, Wentao and Cao, Qiong and Xue, Chao and Zhan, Yibing and Ding, Changxing and He, Xiaodong},
  title   = {{ChartMaster}: Advancing Chart-to-Code Generation with Real-World Charts and Chart Similarity Reinforcement Learning},
  journal = {arXiv preprint arXiv:2508.17608},
  year    = {2025},
}

# LLM/LRM
@article{zhao2023survey,
  author  = {Zhao, Wayne Xin and Zhou, Kun and Li, Junyi and Tang, Tianyi and Wang, Xiaolei and Hou, Yupeng and Min, Yingqian and Zhang, Beichen and Zhang, Junjie and Dong, Zican and others},
  title   = {A survey of large language models},
  journal = {arXiv preprint arXiv:2303.18223},
  year    = {2023},
}

@article{zhang2025100,
  author  = {Zhang, Chong and Deng, Yue and Lin, Xiang and Wang, Bin and Ng, Dianwen and Ye, Hai and Li, Xingxuan and Xiao, Yao and Mo, Zhanfeng and Zhang, Qi and others},
  title   = {100 Days After {DeepSeek-R1}: A Survey on Replication Studies and More Directions for Reasoning Language Models},
  journal = {arXiv preprint arXiv:2505.00551},
  year    = {2025},
}

@article{li2025system,
  author  = {Li, Zhong-Zhi and Zhang, Duzhen and Zhang, Ming-Liang and Zhang, Jiaxin and Liu, Zengyan and Yao, Yuxuan and Xu, Haotian and Zheng, Junhao and Wang, Pei-Jie and Chen, Xiuyi and others},
  title   = {From system 1 to system 2: A survey of reasoning large language models},
  journal = {arXiv preprint arXiv:2502.17419},
  year    = {2025},
}

@article{xu2025towards,
  author  = {Xu, Fengli and Hao, Qianyue and Zong, Zefang and Wang, Jingwei and Zhang, Yunke and Wang, Jingyi and Lan, Xiaochong and Gong, Jiahui and Ouyang, Tianjian and Meng, Fanjin and others},
  title   = {Towards large reasoning models: A survey of reinforced reasoning with large language models},
  journal = {arXiv preprint arXiv:2501.09686},
  year    = {2025},
}

@article{srivastava2025technical,
  author  = {Srivastava, Saksham Sahai and Aggarwal, Vaneet},
  title   = {A Technical Survey of Reinforcement Learning Techniques for Large Language Models},
  journal = {arXiv preprint arXiv:2507.04136},
  year    = {2025},
}

@article{wu2025sailing,
  author  = {Wu, Xiaobao},
  title   = {Sailing by the Stars: A Survey on Reward Models and Learning Strategies for Learning from Rewards},
  journal = {arXiv preprint arXiv:2505.02686},
  year    = {2025},
}

@article{sun2025survey,
  author    = {Sun, Jiankai and Zheng, Chuanyang and Xie, Enze and Liu, Zhengying and Chu, Ruihang and Qiu, Jianing and Xu, Jiaqi and Ding, Mingyu and Li, Hongyang and Geng, Mengzhe and others},
  title     = {A survey of reasoning with foundation models: Concepts, methodologies, and outlook},
  journal   = {ACM Computing Surveys},
  volume    = {57},
  number    = {11},
  pages     = {1--43},
  year      = {2025},
  publisher = {ACM},
  address   = {New York, NY},
}
@article{shen2025skywork,
  author  = {Shen, Wei and Pei, Jiangbo and Peng, Yi and Song, Xuchen and Liu, Yang and Peng, Jian and Sun, Haofeng and Hao, Yunzhuo and Wang, Peiyu and Zhang, Jianhao and others},
  title   = {{Skywork-R1V3} Technical Report},
  journal = {arXiv preprint arXiv:2507.06167},
  year    = {2025},
}
@article{xiaomi2025mimo,
  author  = {{LLM-Core Xiaomi} and Xia, Bingquan and Shen, Bowen and Zhu, Dawei and Zhang, Di and Wang, Gang and Zhang, Hailin and Liu, Huaqiu and Xiao, Jiebao and Dong, Jinhao and others},
  title   = {{MiMo}: Unlocking the Reasoning Potential of Language Model -- From Pretraining to Posttraining},
  journal = {arXiv preprint arXiv:2505.07608},
  year    = {2025},
}
@article{team2025intellect,
  author  = {{Prime Intellect Team} and Jaghouar, Sami and Mattern, Justus and Ong, Jack Min and Straube, Jannik and Basra, Manveer and Pazdera, Aaron and Thaman, Kushal and Di Ferrante, Matthew and Gabriel, Felix and others},
  title   = {{INTELLECT-2}: A Reasoning Model Trained Through Globally Decentralized Reinforcement Learning},
  journal = {arXiv preprint arXiv:2505.07291},
  year    = {2025},
}
@article{team2025hunyuan,
  author  = {{Tencent Hunyuan Team} and Liu, Ao and Zhou, Botong and Xu, Can and Zhou, Chayse and Zhang, ChenChen and Xu, Chengcheng and Wang, Chenhao and Wu, Decheng and Wu, Dengpeng and others},
  title   = {{Hunyuan-TurboS}: Advancing Large Language Models through {Mamba}-{Transformer} Synergy and Adaptive Chain-of-Thought},
  journal = {arXiv preprint arXiv:2505.15431},
  year    = {2025},
}

@misc{deepseekai2024deepseekv32,
  author = {{DeepSeek-AI}},
  title  = {{DeepSeek-V3.2-Exp}: Boosting Long-Context Efficiency with {DeepSeek} Sparse Attention},
  year   = {2025},
  url    = {https://github.com/deepseek-ai/DeepSeek-V3.2-Exp/blob/main/DeepSeek_V3_2.pdf},
}

@misc{GLM4.6,
  author = {{Zhipu-AI}},
  title  = {{GLM-4.6}: Advanced Agentic, Reasoning and Coding Capabilities},
  year   = {2025},
  url    = {https://z.ai/blog/glm-4.6},
}

@misc{Ring1T,
  author = {{inclusionAI}},
  title  = {{Ring-1T-preview}, Deep Thinking, No Waiting},
  year   = {2025},
  url    = {https://huggingface.co/inclusionAI/Ring-1T-preview},
}

@misc{Qwen3VL,
  author = {{Alibaba-Qwen}},
  title  = {{Qwen3-VL}: Sharper Vision, Deeper Thought, Broader Action},
  year   = {2025},
  url    = {https://qwen.ai/blog?id=99f0335c4ad9ff6153e517418d48535ab6d8afef&from=research.latest-advancements-list},
}

@misc{Qwen3next,
  author = {{Alibaba-Qwen}},
  title  = {{Qwen3-Next}: Towards Ultimate Training \& Inference Efficiency},
  year   = {2025},
  url    = {https://qwen.ai/blog?id=4074cca80393150c248e508aa62983f9cb7d27cd&from=research.latest-advancements-list},
}

@misc{Ringmini2.0,
  author = {inclusionAI},
  title  = {Ring-mini-2.0},
  year   = {2025},
  url    = {https://huggingface.co/inclusionAI/Ring-mini-2.0},
}

# RL Methodology
## Reward Design - Verifiable Rewards
@article{sun2025freeprm,
  author  = {Sun, Lin and Liu, Chuang and Ma, Xiaofeng and Yang, Tao and Lu, Weijia and Wu, Ning},
  title   = {{FreePRM}: Training Process Reward Models Without Ground Truth Process Labels},
  journal = {arXiv preprint arXiv:2506.03570},
  year    = {2025},
}

@article{setlur2024rewarding,
  author  = {Setlur, Amrith and Nagpal, Chirag and Fisch, Adam and Geng, Xinyang and Eisenstein, Jacob and Agarwal, Rishabh and Agarwal, Alekh and Berant, Jonathan and Kumar, Aviral},
  title   = {Rewarding Progress: Scaling Automated Process Verifiers for {LLM} Reasoning},
  journal = {arXiv preprint arXiv:2410.08146},
  year    = {2024},
}


@article{lin2025r1,
  author  = {Lin, Hongyu and Li, Yuchen and Luo, Haoran and Yao, Kaichun and Zhang, Libo and Xing, Mingjie and Wu, Yanjun},
  title   = {{OS-R1}: Agentic Operating System Kernel Tuning with Reinforcement Learning},
  journal = {arXiv preprint arXiv:2508.12551},
  year    = {2025},
}

@article{ye2025mobile,
  author  = {Ye, Jiabo and Zhang, Xi and Xu, Haiyang and Liu, Haowei and Wang, Junyang and Zhu, Zhaoqing and Zheng, Ziwei and Gao, Feiyu and Cao, Junjie and Lu, Zhengxi and others},
  title   = {{Mobile-Agent-v3}: Fundamental Agents for {GUI} Automation},
  journal = {arXiv preprint arXiv:2508.15144},
  year    = {2025},
}

@article{lu2025swirl,
  author  = {Lu, Quanfeng and Ma, Zhantao and Zhong, Shuai and Wang, Jin and Yu, Dahai and Ng, Michael K and Luo, Ping},
  title   = {{SWIRL}: A Staged Workflow for Interleaved Reinforcement Learning in Mobile {GUI} Control},
  journal = {arXiv preprint arXiv:2508.20018},
  year    = {2025},
}

@article{zhou2025reinforcing,
  author  = {Zhou, Xiangxin and Liu, Zichen and Sims, Anya and Wang, Haonan and Pang, Tianyu and Li, Chongxuan and Wang, Liang and Lin, Min and Du, Chao},
  title   = {Reinforcing General Reasoning without Verifiers},
  journal = {arXiv preprint arXiv:2505.21493},
  year    = {2025},
}

@article{yu2025rlpr,
  author  = {Yu, Tianyu and Ji, Bo and Wang, Shouli and Yao, Shu and Wang, Zefan and Cui, Ganqu and Yuan, Lifan and Ding, Ning and Yao, Yuan and Liu, Zhiyuan and others},
  title   = {{RLPR}: Extrapolating {RLVR} to General Domains without Verifiers},
  journal = {arXiv preprint arXiv:2506.18254},
  year    = {2025},
}

@article{li2025verifybench,
  author  = {Li, Xuzhao and Li, Xuchen and Hu, Shiyu and Guo, Yongzhen and Zhang, Wentao},
  title   = {{VerifyBench}: A Systematic Benchmark for Evaluating Reasoning Verifiers Across Domains},
  journal = {arXiv preprint arXiv:2507.09884},
  year    = {2025},
}

@article{zhao2025one,
  author  = {Zhao, Yulai and Liu, Haolin and Yu, Dian and Kung, S. Y. and Mi, Haitao and Yu, Dong},
  title   = {One Token to Fool {LLM}-as-a-Judge},
  journal = {arXiv preprint arXiv:2507.08794},
  year    = {2025},
}

@article{huang2025pitfalls,
  author  = {Huang, Yuzhen and Zeng, Weihao and Zeng, Xingshan and Zhu, Qi and He, Junxian},
  title   = {Pitfalls of Rule- and Model-based Verifiers -- A Case Study on Mathematical Reasoning},
  journal = {arXiv preprint arXiv:2505.22203},
  year    = {2025},
}

@misc{kimiteam2025kimik2openagentic,
  author        = {{Kimi Team}},
  title         = {{Kimi K2}: Open Agentic Intelligence},
  year          = {2025},
  eprint        = {2507.20534},
  archiveprefix = {arXiv},
  primaryclass  = {cs.LG},
  url           = {https://arxiv.org/abs/2507.20534},
}

## Reward Design - Unsupervised rewards

@article{wang2025thinking,
  author  = {Wang, Liang and Yang, Nan and Huang, Shaohan and Dong, Li and Wei, Furu},
  title   = {Thinking Augmented Pre-training},
  journal = {arXiv preprint arXiv:2509.20186},
  year    = {2025},
}

@article{liu2025ettrl,
  author  = {Liu, Jia and He, ChangYi and Lin, YingQiao and Yang, MingMin and Shen, FeiYang and Liu, ShaoGuo and Gao, TingTing},
  title   = {{ETTRL}: Balancing Exploration and Exploitation in {LLM} Test-Time Reinforcement Learning Via Entropy Mechanism},
  journal = {arXiv preprint arXiv:2508.11356},
  year    = {2025},
}

@article{chen2025selfques,
  author  = {Chen, Lili and Prabhudesai, Mihir and Fragkiadaki, Katerina and Liu, Hao and Pathak, Deepak},
  title   = {Self-Questioning Language Models},
  journal = {arXiv preprint arXiv:2508.03682},
  year    = {2025},
}

@article{van2025post,
  author  = {van Niekerk, Carel and Vukovic, Renato and Ruppik, Benjamin Matthias and Lin, Hsien-chin and Ga{\v{s}}i{\'c}, Milica},
  title   = {Post-Training Large Language Models via Reinforcement Learning from Self-Feedback},
  journal = {arXiv preprint arXiv:2507.21931},
  year    = {2025},
}

@article{kiruluta2025self,
  author  = {Kiruluta, Andrew and Lemos, Andreas and Burity, Priscilla},
  title   = {A Self-Supervised Reinforcement Learning Approach for Fine-Tuning Large Language Models Using Cross-Attention Signals},
  journal = {arXiv preprint arXiv:2502.10482},
  year    = {2025},
}

@article{zhang2025co,
  author  = {Zhang, Zizhuo and Zhu, Jianing and Ge, Xinmu and Zhao, Zihua and Zhou, Zhanke and Li, Xuan and Feng, Xiao and Yao, Jiangchao and Han, Bo},
  title   = {{Co-Reward}: Self-supervised Reinforcement Learning for Large Language Model Reasoning via Contrastive Agreement},
  journal = {arXiv preprint arXiv:2508.00410},
  year    = {2025},
}



@article{burns2023weak,
  author  = {Burns, Collin and Izmailov, Pavel and Kirchner, Jan Hendrik and Baker, Bowen and Gao, Leo and Aschenbrenner, Leopold and Chen, Yining and Ecoffet, Adrien and Joglekar, Manas and Leike, Jan and others},
  title   = {Weak-to-strong generalization: Eliciting strong capabilities with weak supervision},
  journal = {arXiv preprint arXiv:2312.09390},
  year    = {2023},
}

@misc{lai2025computerrlscalingendtoendonline,
  author        = {Hanyu Lai and Xiao Liu and Yanxiao Zhao and Han Xu and Hanchen Zhang and Bohao Jing and Yanyu Ren and Shuntian Yao and Yuxiao Dong and Jie Tang},
  title         = {{ComputerRL}: Scaling End-to-End Online Reinforcement Learning for Computer Use Agents},
  year          = {2025},
  eprint        = {2508.14040},
  archiveprefix = {arXiv},
  primaryclass  = {cs.AI},
  url           = {https://arxiv.org/abs/2508.14040},
}

@misc{li2024numinamath,
  author       = {Li, Jia and Beeching, Edward and Tunstall, Lewis and Lipkin, Ben and Soletskyi, Roman and Huang, Shengyi and Rasul, Kashif and Yu, Longhui and Jiang, Albert Q and Shen, Ziju and others},
  title        = {{NuminaMath}: The Largest Public Dataset in {AI4Maths} with 860k Pairs of Competition Math Problems and Solutions},
  howpublished = {Hugging Face repository},
  year         = {2024},
  url          = {https://huggingface.co/AI-MO/NuminaMath-CoT},
}

@article{glazer2024frontiermath,
  author  = {Glazer, Elliot and Erdil, Ege and Besiroglu, Tamay and Chicharro, Diego and Chen, Evan and Gunning, Alex and Olsson, Caroline Falkman and Denain, Jean-Stanislas and Ho, Anson and Santos, Emily de Oliveira and others},
  title   = {{FrontierMath}: A Benchmark for Evaluating Advanced Mathematical Reasoning in {AI}},
  journal = {arXiv preprint arXiv:2411.04872},
  year    = {2024},
}

@misc{liu2025there,
  author = {Liu, Zichen and Chen, Changyu and Li, Wenjun and Pang, Tianyu and Du, Chao and Lin, Min},
  title  = {There May Not Be Aha Moment in {R1-Zero}-like Training --- A Pilot Study},
  year   = {2025},
  url    = {https://oatllm.notion.site/oat-zero},
  note   = {Notion Blog},
}

@article{song2024mind,
  author  = {Song, Yuda and Zhang, Hanlin and Eisenach, Carson and Kakade, Sham and Foster, Dean and Ghai, Udaya},
  title   = {Mind the gap: Examining the self-improvement capabilities of large language models},
  journal = {arXiv preprint arXiv:2412.02674},
  year    = {2024},
}

@article{gao2025uishift,
  author  = {Gao, Longxi and Zhang, Li and Xu, Mengwei},
  title   = {{UIShift}: Enhancing {VLM}-based {GUI} Agents through Self-supervised Reinforcement Learning},
  journal = {arXiv preprint arXiv:2505.12493},
  year    = {2025},
}

@article{du2025test,
  author  = {Du, Yong and Yan, Yuchen and Tang, Fei and Lu, Zhengxi and Zong, Chang and Lu, Weiming and Jiang, Shengpei and Shen, Yongliang},
  title   = {Test-Time Reinforcement Learning for {GUI} Grounding via Region Consistency},
  journal = {arXiv preprint arXiv:2508.05615},
  year    = {2025},
}


@article{shi2025mobilegui,
  author  = {Shi, Yucheng and Yu, Wenhao and Li, Zaitang and Wang, Yonglin and Zhang, Hongming and Liu, Ninghao and Mi, Haitao and Yu, Dong},
  title   = {{MobileGUI-RL}: Advancing Mobile {GUI} Agent through Reinforcement Learning in Online Environment},
  journal = {arXiv preprint arXiv:2507.05720},
  year    = {2025},
}

@article{wu2025gui,
  author  = {Wu, Penghao and Ma, Shengnan and Wang, Bo and Yu, Jiaheng and Lu, Lewei and Liu, Ziwei},
  title   = {{GUI-Reflection}: Empowering Multimodal {GUI} Models with Self-Reflection Behavior},
  journal = {arXiv preprint arXiv:2506.08012},
  year    = {2025},
}

@article{wanyan2025look,
  author  = {Wanyan, Yuyang and Zhang, Xi and Xu, Haiyang and Liu, Haowei and Wang, Junyang and Ye, Jiabo and Kou, Yutong and Yan, Ming and Huang, Fei and Yang, Xiaoshan and others},
  title   = {Look Before You Leap: A {GUI-Critic-R1} Model for Pre-Operative Error Diagnosis in {GUI} Automation},
  journal = {arXiv preprint arXiv:2506.04614},
  year    = {2025},
}


@inproceedings{poesia2024learning,
  author    = {Poesia, Gabriel and Broman, David and Haber, Nick and Goodman, Noah},
  title     = {Learning formal mathematics from intrinsic motivation},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {37},
  pages     = {43032--43057},
  year      = {2024},
}


@article{yeo2025demystifying,
  author  = {Yeo, Edward and Tong, Yuxuan and Niu, Morry and Neubig, Graham and Yue, Xiang},
  title   = {Demystifying Long Chain-of-Thought Reasoning in {LLMs}},
  journal = {arXiv preprint arXiv:2502.03373},
  year    = {2025},
}


@misc{openr1,
  author = {{Hugging Face}},
  title  = {{Open R1}: A Fully Open Reproduction of {DeepSeek-R1}},
  month  = jan,
  year   = {2025},
  url    = {https://github.com/huggingface/open-r1},
}

@article{huang2025r,
  author  = {Huang, Chengsong and Yu, Wenhao and Wang, Xiaoyang and Zhang, Hongming and Li, Zongxia and Li, Ruosen and Huang, Jiaxin and Mi, Haitao and Yu, Dong},
  title   = {{R-Zero}: Self-Evolving Reasoning {LLM} from Zero Data},
  journal = {arXiv preprint arXiv:2508.05004},
  year    = {2025},
}

@article{pang2023language,
  author  = {Pang, Jing-Cheng and Wang, Pengyuan and Li, Kaiyuan and Chen, Xiong-Hui and Xu, Jiacheng and Zhang, Zongzhang and Yu, Yang},
  title   = {Language model self-improvement by reinforcement learning contemplation},
  journal = {arXiv preprint arXiv:2305.14483},
  year    = {2023},
}

@article{yang2025ssr,
  author  = {Yang, Wenjie and Zheng, Mao and Song, Mingyang and Li, Zheng and Wang, Sitong},
  title   = {{SSR-Zero}: Simple Self-Rewarding Reinforcement Learning for Machine Translation},
  journal = {arXiv preprint arXiv:2505.16637},
  year    = {2025},
}

@article{chen2024self,
  author  = {Chen, Zixiang and Deng, Yihe and Yuan, Huizhuo and Ji, Kaixuan and Gu, Quanquan},
  title   = {Self-play fine-tuning converts weak language models to strong language models},
  journal = {arXiv preprint arXiv:2401.01335},
  year    = {2024},
}

@inproceedings{zhou2024calibrated,
  author    = {Zhou, Yiyang and Fan, Zhiyuan and Cheng, Dongjie and Yang, Sihan and Chen, Zhaorun and Cui, Chenhang and Wang, Xiyao and Li, Yun and Zhang, Linjun and Yao, Huaxiu},
  title     = {Calibrated self-rewarding vision language models},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {37},
  pages     = {51503--51531},
  year      = {2024},
}

@article{zou2025trans,
  author  = {Zou, Wei and Yang, Sen and Bao, Yu and Huang, Shujian and Chen, Jiajun and Cheng, Shanbo},
  title   = {{Trans-Zero}: Self-Play Incentivizes Large Language Models for Multilingual Translation Without Parallel Data},
  journal = {arXiv preprint arXiv:2504.14669},
  year    = {2025},
}

@article{xin2025surrogate,
  author  = {Xin, Rihui and Liu, Han and Wang, Zecheng and Zhang, Yupeng and Sui, Dianbo and Hu, Xiaolin and Wang, Bingning},
  title   = {Surrogate Signals from Format and Length: Reinforcement Learning for Solving Mathematical Problems without Ground Truth Answers},
  journal = {arXiv preprint arXiv:2505.19439},
  year    = {2025},
}

@article{gandhi2025cognitive,
  author  = {Gandhi, Kanishk and Chakravarthy, Ayush and Singh, Anikait and Lile, Nathan and Goodman, Noah D},
  title   = {Cognitive Behaviors that Enable Self-Improving Reasoners, or, Four Habits of Highly Effective {STaRs}},
  journal = {arXiv preprint arXiv:2503.01307},
  year    = {2025},
}


@article{zweiger2025self,
  author  = {Zweiger, Adam and Pari, Jyothish and Guo, Han and Aky{\"u}rek, Ekin and Kim, Yoon and Agrawal, Pulkit},
  title   = {Self-Adapting Language Models},
  journal = {arXiv preprint arXiv:2506.10943},
  year    = {2025},
}

@article{zuo2025ttrl,
  author  = {Zuo, Yuxin and Zhang, Kaiyan and Sheng, Li and Qu, Shang and Cui, Ganqu and Zhu, Xuekai and Li, Haozhan and Zhang, Yuchen and Long, Xinwei and Hua, Ermo and others},
  title   = {{TTRL}: Test-Time Reinforcement Learning},
  journal = {arXiv preprint arXiv:2504.16084},
  year    = {2025},
}

@article{wang2025reinforcement,
  author  = {Wang, Yiping and Yang, Qing and Zeng, Zhiyuan and Ren, Liliang and Liu, Liyuan and Peng, Baolin and Cheng, Hao and He, Xuehai and Wang, Kuan and Gao, Jianfeng and others},
  title   = {Reinforcement learning for reasoning in large language models with one training example},
  journal = {arXiv preprint arXiv:2504.20571},
  year    = {2025},
}

@article{zhang2025right,
  author  = {Zhang, Qingyang and Wu, Haitao and Zhang, Changqing and Zhao, Peilin and Bian, Yatao},
  title   = {Right Question is Already Half the Answer: Fully Unsupervised {LLM} Reasoning Incentivization},
  journal = {arXiv preprint arXiv:2504.05812},
  year    = {2025},
}

@article{zhao2025learning,
  author  = {Zhao, Xuandong and Kang, Zhewei and Feng, Aosong and Levine, Sergey and Song, Dawn},
  title   = {Learning to reason without external rewards},
  journal = {arXiv preprint arXiv:2505.19590},
  year    = {2025},
}

@article{cui2025entropy,
  author  = {Cui, Ganqu and Zhang, Yuchen and Chen, Jiacheng and Yuan, Lifan and Wang, Zhi and Zuo, Yuxin and Li, Haozhan and Fan, Yuchen and Chen, Huayu and Chen, Weize and others},
  title   = {The entropy mechanism of reinforcement learning for reasoning language models},
  journal = {arXiv preprint arXiv:2505.22617},
  year    = {2025},
}

@article{fang2025serl,
  author  = {Fang, Wenkai and Liu, Shunyu and Zhou, Yang and Zhang, Kongcheng and Zheng, Tongya and Chen, Kaixuan and Song, Mingli and Tao, Dacheng},
  title   = {{SeRL}: Self-Play Reinforcement Learning for Large Language Models with Limited Data},
  journal = {arXiv preprint arXiv:2505.20347},
  year    = {2025},
}

@article{zhang2025consistent,
  author  = {Zhang, Kongcheng and Yao, Qi and Liu, Shunyu and Wang, Yingjie and Lai, Baisheng and Ye, Jieping and Song, Mingli and Tao, Dacheng},
  title   = {Consistent Paths Lead to Truth: Self-Rewarding Reinforcement Learning for {LLM} Reasoning},
  journal = {arXiv preprint arXiv:2506.08745},
  year    = {2025},
}

@article{prabhudesai2025maximizing,
  author  = {Prabhudesai, Mihir and Chen, Lili and Ippoliti, Alex and Fragkiadaki, Katerina and Liu, Hao and Pathak, Deepak},
  title   = {Maximizing Confidence Alone Improves Reasoning},
  journal = {arXiv preprint arXiv:2505.22660},
  year    = {2025},
}

@inproceedings{lee2013pseudo,
  author    = {Lee, Dong-Hyun},
  title     = {Pseudo-label: The simple and efficient semi-supervised learning method for deep neural networks},
  booktitle = {Workshop on Challenges in Representation Learning, {ICML}},
  volume    = {3},
  pages     = {896},
  year      = {2013},
  address   = {Atlanta},
}

@article{shumailov2023curse,
  author  = {Shumailov, Ilia and Shumaylov, Zakhar and Zhao, Yiren and Gal, Yarin and Papernot, Nicolas and Anderson, Ross},
  title   = {The curse of recursion: Training on generated data makes models forget},
  journal = {arXiv preprint arXiv:2305.17493},
  year    = {2023},
}


@article{yuan2024self,
  author  = {Yuan, Weizhe and Pang, Richard Yuanzhe and Cho, Kyunghyun and Sukhbaatar, Sainbayar and Xu, Jing and Weston, Jason},
  title   = {Self-rewarding language models},
  journal = {arXiv preprint arXiv:2401.10020},
  year    = {2024},
}

@article{wu2024meta,
  author  = {Wu, Tianhao and Yuan, Weizhe and Golovneva, Olga and Xu, Jing and Tian, Yuandong and Jiao, Jiantao and Weston, Jason and Sukhbaatar, Sainbayar},
  title   = {Meta-Rewarding Language Models: Self-Improving Alignment with {LLM}-as-a-Meta-Judge},
  journal = {arXiv preprint arXiv:2407.19594},
  year    = {2024},
}

@article{zhang2025process,
  author  = {Zhang, Shimao and Liu, Xiao and Zhang, Xin and Liu, Junxiao and Luo, Zheheng and Huang, Shujian and Gong, Yeyun},
  title   = {Process-based self-rewarding language models},
  journal = {arXiv preprint arXiv:2503.03746},
  year    = {2025},
}



@article{shao2025spurious,
  author  = {Shao, Rulin and Li, Shuyue Stella and Xin, Rui and Geng, Scott and Wang, Yiping and Oh, Sewoong and Du, Simon Shaolei and Lambert, Nathan and Min, Sewon and Krishna, Ranjay and others},
  title   = {Spurious Rewards: Rethinking Training Signals in {RLVR}},
  journal = {arXiv preprint arXiv:2506.10947},
  year    = {2025},
}

@article{wei2025unsupervised,
  author  = {Wei, Lai and Li, Yuting and Wang, Chen and Wang, Yue and Kong, Linghe and Huang, Weiran and Sun, Lichao},
  title   = {Unsupervised Post-Training for Multi-Modal {LLM} Reasoning via {GRPO}},
  journal = {arXiv preprint arXiv:2505.22453},
  year    = {2025},
}

@article{li2025confidence,
  author  = {Li, Pengyi and Skripkin, Matvey and Zubrey, Alexander and Kuznetsov, Andrey and Oseledets, Ivan},
  title   = {Confidence Is All You Need: Few-Shot {RL} Fine-Tuning of Language Models},
  journal = {arXiv preprint arXiv:2506.06395},
  year    = {2025},
}

@article{shafayat2025can,
  author  = {Shafayat, Sheikh and Tajwar, Fahim and Salakhutdinov, Ruslan and Schneider, Jeff and Zanette, Andrea},
  title   = {Can Large Reasoning Models Self-Train?},
  journal = {arXiv preprint arXiv:2505.21444},
  year    = {2025},
}

@article{dong2025reinforcement,
  author  = {Dong, Qingxiu and Dong, Li and Tang, Yao and Ye, Tianzhu and Sun, Yutao and Sui, Zhifang and Wei, Furu},
  title   = {Reinforcement Pre-Training},
  journal = {arXiv preprint arXiv:2506.08007},
  year    = {2025},
}

@article{zhang2025no,
  author  = {Zhang, Yanzhi and Zhang, Zhaoxi and Guan, Haoxiang and Cheng, Yilin and Duan, Yitong and Wang, Chen and Wang, Yue and Zheng, Shuxin and He, Jiyan},
  title   = {No Free Lunch: Rethinking Internal Feedback for {LLM} Reasoning},
  journal = {arXiv preprint arXiv:2506.17219},
  year    = {2025},
}

@article{xu2025genius,
  author  = {Xu, Fangzhi and Yan, Hang and Ma, Chang and Zhao, Haiteng and Sun, Qiushi and Cheng, Kanzhi and He, Junxian and Liu, Jun and Wu, Zhiyong},
  title   = {Genius: A generalizable and purely unsupervised self-training framework for advanced reasoning},
  journal = {arXiv preprint arXiv:2504.08672},
  year    = {2025},
}

@article{agarwal2025unreasonable,
  author  = {Agarwal, Shivam and Zhang, Zimin and Yuan, Lifan and Han, Jiawei and Peng, Hao},
  title   = {The Unreasonable Effectiveness of Entropy Minimization in {LLM} Reasoning},
  journal = {arXiv preprint arXiv:2505.15134},
  year    = {2025},
}

@article{eysenbach2018diversity,
  author  = {Eysenbach, Benjamin and Gupta, Abhishek and Ibarz, Julian and Levine, Sergey},
  title   = {Diversity is all you need: Learning skills without a reward function},
  journal = {arXiv preprint arXiv:1802.06070},
  year    = {2018},
}

@article{kim2023variational,
  author  = {Kim, Seongun and Lee, Kyowoon and Choi, Jaesik},
  title   = {Variational curriculum reinforcement learning for unsupervised discovery of skills},
  journal = {arXiv preprint arXiv:2310.19424},
  year    = {2023},
}

@inproceedings{krishnan2020improving,
  author    = {Krishnan, Ranganath and Tickoo, Omesh},
  title     = {Improving model calibration with accuracy versus uncertainty optimization},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {33},
  pages     = {18237--18248},
  year      = {2020},
}

@inproceedings{grandvalet2004semi,
  author    = {Grandvalet, Yves and Bengio, Yoshua},
  title     = {Semi-supervised learning by entropy minimization},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {17},
  year      = {2004},
}


@article{wang2020tent,
  author  = {Wang, Dequan and Shelhamer, Evan and Liu, Shaoteng and Olshausen, Bruno and Darrell, Trevor},
  title   = {Tent: Fully test-time adaptation by entropy minimization},
  journal = {arXiv preprint arXiv:2006.10726},
  year    = {2020},
}

@article{zhang2024come,
  author  = {Zhang, Qingyang and Bian, Yatao and Kong, Xinke and Zhao, Peilin and Zhang, Changqing},
  title   = {{COME}: Test-time Adaption by Conservatively Minimizing Entropy},
  journal = {arXiv preprint arXiv:2410.10894},
  year    = {2024},
}

@article{huang2024self,
  author  = {Huang, Audrey and Block, Adam and Foster, Dylan J and Rohatgi, Dhruv and Zhang, Cyril and Simchowitz, Max and Ash, Jordan T and Krishnamurthy, Akshay},
  title   = {Self-Improvement in Language Models: The Sharpening Mechanism},
  journal = {arXiv preprint arXiv:2412.01951},
  year    = {2024},
}

@inproceedings{chapelle2005semi,
  title        = {Semi-Supervised Classification by Low Density Separation},
  author       = {Chapelle, Olivier and Zien, Alexander},
  booktitle    = {International Workshop on Artificial Intelligence and Statistics},
  pages        = {57--64},
  year         = {2005},
  organization = {PMLR}
}

@article{bai2022constitutional,
  title   = {Constitutional {AI}: Harmlessness from {AI} Feedback},
  author  = {Bai, Yuntao and Kadavath, Saurav and Kundu, Sandipan and Askell, Amanda and Kernion, Jackson and Jones, Andy and Chen, Anna and Goldie, Anna and Mirhoseini, Azalia and McKinnon, Cameron and others},
  journal = {arXiv preprint arXiv:2212.08073},
  year    = {2022}
}

@article{chen2025seed,
  title   = {{SEED-GRPO}: Semantic Entropy Enhanced {GRPO} for Uncertainty-Aware Policy Optimization},
  author  = {Chen, Minghan and Chen, Guikun and Wang, Wenguan and Yang, Yi},
  journal = {arXiv preprint arXiv:2505.12346},
  year    = {2025}
}

@article{cheng2025reasoning,
  author  = {Cheng, Daixuan and Huang, Shaohan and Zhu, Xuekai and Dai, Bo and Zhao, Wayne Xin and Zhang, Zhenliang and Wei, Furu},
  title   = {Reasoning with Exploration: An Entropy Perspective},
  journal = {arXiv preprint arXiv:2506.14758},
  year    = {2025},
}

@article{zhao2025absolute,
  title   = {{Absolute Zero}: Reinforced Self-play Reasoning with Zero Data},
  author  = {Zhao, Andrew and Wu, Yiran and Yue, Yang and Wu, Tong and Xu, Quentin and Lin, Matthieu and Wang, Shenzhi and Wu, Qingyun and Zheng, Zilong and Huang, Gao},
  journal = {arXiv preprint arXiv:2505.03335},
  year    = {2025}
}


@article{press2024entropy,
  title   = {The Entropy Enigma: Success and Failure of Entropy Minimization},
  author  = {Press, Ori and Shwartz-Ziv, Ravid and LeCun, Yann and Bethge, Matthias},
  journal = {arXiv preprint arXiv:2405.05012},
  year    = {2024}
}

@article{lv2025climb,
  author  = {Lv, Ang and Xie, Ruobing and Sun, Xingwu and Kang, Zhanhui and Yan, Rui},
  title   = {The Climb Carves Wisdom Deeper Than the Summit: On the Noisy Rewards in Learning to Reason},
  journal = {arXiv preprint arXiv:2505.22653},
  year    = {2025},
}


## Reward Design - Credit Assignment
@article{uesato2022solving,
  title   = {Solving Math Word Problems with Process- and Outcome-Based Feedback},
  author  = {Uesato, Jonathan and Kushman, Nate and Kumar, Ramana and Song, Francis and Siegel, Noah and Wang, Lisa and Creswell, Antonia and Irving, Geoffrey and Higgins, Irina},
  journal = {arXiv preprint arXiv:2211.14275},
  year    = {2022}
}

@inproceedings{lightman2023let,
  author    = {Lightman, Hunter and Kosaraju, Vineet and Burda, Yuri and Edwards, Harrison and Baker, Bowen and Lee, Teddy and Leike, Jan and Schulman, John and Sutskever, Ilya and Cobbe, Karl},
  title     = {Let's Verify Step by Step},
  booktitle = {The Twelfth International Conference on Learning Representations},
  year      = {2024},
  url       = {https://openreview.net/forum?id=v8L0pN6EOi},
}

@inproceedings{wang2023math,
  title     = {{Math-Shepherd}: Verify and Reinforce {LLMs} Step-by-step without Human Annotations},
  author    = {Wang, Peiyi and
               Li, Lei and
               Shao, Zhihong and
               Xu, Runxin and
               Dai, Damai and
               Li, Yifei and
               Chen, Deli and
               Wu, Yu and
               Sui, Zhifang},
  editor    = {Ku, Lun-Wei and
               Martins, Andre and
               Srikumar, Vivek},
  booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = aug,
  year      = {2024},
  address   = {Bangkok, Thailand},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.acl-long.510/},
  doi       = {10.18653/v1/2024.acl-long.510},
  pages     = {9426--9439}
}

@article{rafailov2023direct,
  title   = {Direct Preference Optimization: Your Language Model is Secretly a Reward Model},
  author  = {Rafailov, Rafael and Sharma, Archit and Mitchell, Eric and Manning, Christopher D. and Ermon, Stefano and Finn, Chelsea},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {36},
  pages   = {53728--53741},
  year    = {2023}
}

@inproceedings{rafailov2024r,
  title     = {From {$r$} to {$Q^*$}: Your Language Model is Secretly a {Q-Function}},
  author    = {Rafailov, Rafael and Hejna, Joey and Park, Ryan and Finn, Chelsea},
  booktitle = {First Conference on Language Modeling},
  year      = {2024},
  url       = {https://openreview.net/forum?id=kEVcNxtqXk}
}

@inproceedings{yuan2024free,
  author    = {Yuan, Lifan and Li, Wendi and Chen, Huayu and Cui, Ganqu and Ding, Ning and Zhang, Kaiyan and Zhou, Bowen and Liu, Zhiyuan and Peng, Hao},
  title     = {Free Process Rewards without Process Labels},
  booktitle = {Forty-second International Conference on Machine Learning},
  year      = {2025},
  url       = {https://openreview.net/forum?id=8ThnPFhGm8},
}

@article{zhang2025lessons,
  author  = {Zhang, Zhenru and Zheng, Chujie and Wu, Yangzhen and Zhang, Beichen and Lin, Runji and Yu, Bowen and Liu, Dayiheng and Zhou, Jingren and Lin, Junyang},
  title   = {The Lessons of Developing Process Reward Models in Mathematical Reasoning},
  journal = {arXiv preprint arXiv:2501.07301},
  year    = {2025},
}

@inproceedings{hadfield2017inverse,
  author    = {Hadfield-Menell, Dylan and Milli, Smitha and Abbeel, Pieter and Russell, Stuart J. and Dragan, Anca},
  booktitle = {Advances in Neural Information Processing Systems},
  editor    = {Guyon, I. and von Luxburg, U. and Bengio, S. and Wallach, H. and Fergus, R. and Vishwanathan, S. and Garnett, R.},
  publisher = {Curran Associates, Inc.},
  title     = {Inverse Reward Design},
  url       = {https://proceedings.neurips.cc/paper_files/paper/2017/file/32fdab6559cdfa4f167f8c31b9199643-Paper.pdf},
  volume    = {30},
  year      = {2017}
}

@article{schrittwieser2020mastering,
  title     = {Mastering {Atari}, {Go}, Chess and Shogi by Planning with a Learned Model},
  author    = {Schrittwieser, Julian and Antonoglou, Ioannis and Hubert, Thomas and Simonyan, Karen and Sifre, Laurent and Schmitt, Simon and Guez, Arthur and Lockhart, Edward and Hassabis, Demis and Graepel, Thore and others},
  journal   = {Nature},
  volume    = {588},
  number    = {7839},
  pages     = {604--609},
  year      = {2020},
  publisher = {Nature Publishing Group UK London}
}

@article{silver2018general,
  title     = {A General Reinforcement Learning Algorithm that Masters Chess, Shogi, and {Go} through Self-Play},
  author    = {Silver, David and Hubert, Thomas and Schrittwieser, Julian and Antonoglou, Ioannis and Lai, Matthew and Guez, Arthur and Lanctot, Marc and Sifre, Laurent and Kumaran, Dharshan and Graepel, Thore and others},
  journal   = {Science},
  volume    = {362},
  number    = {6419},
  pages     = {1140--1144},
  year      = {2018},
  publisher = {American Association for the Advancement of Science}
}

@inproceedings{arulkumaran2019alphastar,
  title     = {{AlphaStar}: An Evolutionary Computation Perspective},
  author    = {Arulkumaran, Kai and Cully, Antoine and Togelius, Julian},
  booktitle = {Proceedings of the Genetic and Evolutionary Computation Conference Companion},
  pages     = {314--315},
  year      = {2019}
}

@article{silver2017mastering,
  title   = {Mastering Chess and Shogi by Self-Play with a General Reinforcement Learning Algorithm},
  author  = {Silver, David and Hubert, Thomas and Schrittwieser, Julian and Antonoglou, Ioannis and Lai, Matthew and Guez, Arthur and Lanctot, Marc and Sifre, Laurent and Kumaran, Dharshan and Graepel, Thore and others},
  journal = {arXiv preprint arXiv:1712.01815},
  year    = {2017}
}

@article{silver2016mastering,
  title     = {Mastering the Game of {Go} with Deep Neural Networks and Tree Search},
  author    = {Silver, David and Huang, Aja and Maddison, Chris J. and Guez, Arthur and Sifre, Laurent and van den Driessche, George and Schrittwieser, Julian and Antonoglou, Ioannis and Panneershelvam, Veda and Lanctot, Marc and others},
  journal   = {Nature},
  volume    = {529},
  number    = {7587},
  pages     = {484--489},
  year      = {2016},
  publisher = {Nature Publishing Group}
}

@inproceedings{liu2022meta,
  author    = {Liu, Runze and Bai, Fengshuo and Du, Yali and Yang, Yaodong},
  booktitle = {Advances in Neural Information Processing Systems},
  editor    = {S. Koyejo and S. Mohamed and A. Agarwal and D. Belgrave and K. Cho and A. Oh},
  pages     = {22270--22284},
  publisher = {Curran Associates, Inc.},
  title     = {{Meta-Reward-Net}: Implicitly Differentiable Reward Learning for Preference-based Reinforcement Learning},
  url       = {https://proceedings.neurips.cc/paper_files/paper/2022/file/8be9c134bb193d8bd3827d4df8488228-Paper-Conference.pdf},
  volume    = {35},
  year      = {2022}
}

@article{sun2025large,
  title    = {A large language model-driven reward design framework via dynamic feedback for reinforcement learning},
  journal  = {Knowledge-Based Systems},
  volume   = {326},
  pages    = {114065},
  year     = {2025},
  issn     = {0950-7051},
  doi      = {10.1016/j.knosys.2025.114065},
  url      = {https://www.sciencedirect.com/science/article/pii/S0950705125011104},
  author   = {Shengjie Sun and Runze Liu and Jiafei Lyu and Jing-Wen Yang and Liangpeng Zhang and Xiu Li},
  keywords = {Reinforcement learning, Large language model, Reward design}
}

@article{liu2025can1b,
  title   = {Can {1B} {LLM} Surpass {405B} {LLM}? Rethinking Compute-Optimal Test-Time Scaling},
  author  = {Liu, Runze and Gao, Junqi and Zhao, Jian and Zhang, Kaiyan and Li, Xiu and Qi, Biqing and Ouyang, Wanli and Zhou, Bowen},
  journal = {arXiv preprint arXiv:2502.06703},
  year    = {2025}
}

@article{zou2025reasonflux,
  title   = {{ReasonFlux-PRM}: Trajectory-Aware {PRMs} for Long Chain-of-Thought Reasoning in {LLMs}},
  author  = {Zou, Jiaru and Yang, Ling and Gu, Jingwen and Qiu, Jiahao and Shen, Ke and He, Jingrui and Wang, Mengdi},
  journal = {arXiv preprint arXiv:2506.18896},
  year    = {2025}
}

@article{dong2025tool,
  title   = {{Tool-Star}: Empowering {LLM-Brained} Multi-Tool Reasoner via Reinforcement Learning},
  author  = {Dong, Guanting and Chen, Yifei and Li, Xiaoxi and Jin, Jiajie and Qian, Hongjin and Zhu, Yutao and Mao, Hangyu and Zhou, Guorui and Dou, Zhicheng and Wen, Ji-Rong},
  journal = {arXiv preprint arXiv:2505.16410},
  year    = {2025}
}

@article{wang2025stepsearch,
  title   = {{StepSearch}: Igniting {LLMs} Search Ability via Step-Wise Proximal Policy Optimization},
  author  = {Wang, Ziliang and Zheng, Xuhui and An, Kang and Ouyang, Cijun and Cai, Jialu and Wang, Yuhang and Wu, Yichao},
  journal = {arXiv preprint arXiv:2505.15107},
  year    = {2025}
}


@article{zheng2024processbench,
  title   = {{ProcessBench}: Identifying Process Errors in Mathematical Reasoning},
  author  = {Zheng, Chujie and Zhang, Zhenru and Zhang, Beichen and Lin, Runji and Lu, Keming and Yu, Bowen and Liu, Dayiheng and Zhou, Jingren and Lin, Junyang},
  journal = {arXiv preprint arXiv:2412.06559},
  year    = {2024}
}


@inproceedings{zhang2025openprm,
  author    = {Zhang, Kaiyan and Zhang, Jiayuan and Li, Haoxin and Zhu, Xuekai and Hua, Ermo and Lv, Xingtai and Ding, Ning and Qi, Biqing and Zhou, Bowen},
  title     = {{OpenPRM}: Building Open-domain Process-based Reward Models with Preference Trees},
  booktitle = {The Thirteenth International Conference on Learning Representations},
  year      = {2025},
  url       = {https://openreview.net/forum?id=fGIqGfmgkW},
}

@article{cui2025process,
  author  = {Cui, Ganqu and Yuan, Lifan and Wang, Zefan and Wang, Hanbin and Li, Wendi and He, Bingxiang and Fan, Yuchen and Yu, Tianyu and Xu, Qixin and Chen, Weize and others},
  title   = {Process Reinforcement through Implicit Rewards},
  journal = {arXiv preprint arXiv:2502.01456},
  year    = {2025},
}

@inproceedings{kazemnejad2024vineppo,
  title     = {{VinePPO}: Refining Credit Assignment in {RL} Training of {LLMs}},
  author    = {Kazemnejad, Amirhossein and Aghajohari, Milad and Portelance, Eva and Sordoni, Alessandro and Reddy, Siva and Courville, Aaron and Le Roux, Nicolas},
  booktitle = {Forty-second International Conference on Machine Learning},
  year      = {2025},
  url       = {https://openreview.net/forum?id=Myx2kJFzAn}
}

@inproceedings{qu2025optimizing,
  author    = {Qu, Yuxiao and Yang, Matthew Y. R. and Setlur, Amrith and Tunstall, Lewis and Beeching, Edward Emanuel and Salakhutdinov, Ruslan and Kumar, Aviral},
  title     = {Optimizing Test-Time Compute via Meta Reinforcement Finetuning},
  booktitle = {Forty-second International Conference on Machine Learning},
  year      = {2025},
  url       = {https://openreview.net/forum?id=TqODUDsU4u},
}

@article{cheng2025stop,
  author  = {Cheng, Jie and Qiao, Ruixi and Li, Lijun and Guo, Chao and Wang, Junle and Xiong, Gang and Lv, Yisheng and Wang, Fei-Yue},
  title   = {Stop Summation: Min-Form Credit Assignment Is All Process Reward Model Needs for Reasoning},
  journal = {arXiv preprint arXiv:2504.15275},
  year    = {2025},
}

@article{dai2025s,
  title   = {{S-GRPO}: Early Exit via Reinforcement Learning in Reasoning Models},
  author  = {Dai, Muzhi and Yang, Chenxu and Si, Qingyi},
  journal = {arXiv preprint arXiv:2505.07686},
  year    = {2025}
}

@article{chen2025stepwise,
  title   = {Stepwise Guided Policy Optimization: Coloring your Incorrect Reasoning in {GRPO}},
  author  = {Chen, Peter and Li, Xiaopeng and Li, Ziniu and Chen, Xi and Lin, Tianyi},
  journal = {arXiv preprint arXiv:2505.11595},
  year    = {2025}
}

@article{wang2025spa,
  title   = {{SPA-RL}: Reinforcing {LLM} Agents via Stepwise Progress Attribution},
  author  = {Wang, Hanlin and Leong, Chak Tou and Wang, Jiashuo and Wang, Jian and Li, Wenjie},
  journal = {arXiv preprint arXiv:2505.20732},
  year    = {2025}
}

@article{guo2025segment,
  title   = {Segment Policy Optimization: Effective Segment-Level Credit Assignment in {RL} for Large Language Models},
  author  = {Guo, Yiran and Xu, Lijie and Liu, Jie and Ye, Dan and Qiu, Shuang},
  journal = {arXiv preprint arXiv:2505.23564},
  year    = {2025}
}

@article{yang2025treerpo,
  title   = {{TreeRPO}: Tree Relative Policy Optimization},
  author  = {Yang, Zhicheng and Guo, Zhijiang and Huang, Yinya and Liang, Xiaodan and Wang, Yiwei and Tang, Jing},
  journal = {arXiv preprint arXiv:2506.05183},
  year    = {2025}
}

@article{hou2025treerl,
  title   = {{TreeRL}: {LLM} Reinforcement Learning with On-Policy Tree Search},
  author  = {Hou, Zhenyu and Hu, Ziniu and Li, Yujiang and Lu, Rui and Tang, Jie and Dong, Yuxiao},
  journal = {arXiv preprint arXiv:2506.11902},
  year    = {2025}
}

@article{fei2025self,
  author  = {Fei, Wu and Kong, Hao and Liang, Shuxian and Lin, Yang and Yang, Yibo and Tang, Jing and Chen, Lei and Hua, Xiansheng},
  title   = {Self-Guided Process Reward Optimization with Redefined Step-wise Advantage for Process Reinforcement Learning},
  journal = {arXiv preprint arXiv:2507.01551},
  year    = {2025},
}

@article{zheng2025first,
  author  = {Zheng, Tianyu and Xing, Tianshun and Gu, Qingshui and Liang, Taoran and Qu, Xingwei and Zhou, Xin and Li, Yizhi and Wen, Zhoufutu and Lin, Chenghua and Huang, Wenhao and others},
  title   = {First Return, Entropy-Eliciting Explore},
  journal = {arXiv preprint arXiv:2507.07017},
  year    = {2025},
}

@article{he2025good,
  title   = {Good Learners Think Their Thinking: Generative {PRM} Makes Large Reasoning Model More Efficient Math Learner},
  author  = {He, Tao and Mu, Rongchuan and Liao, Lizi and Cao, Yixin and Liu, Ming and Qin, Bing},
  journal = {arXiv preprint arXiv:2507.23317},
  year    = {2025}
}

@article{tan2025gtpo,
  title   = {{GTPO} and {GRPO-S}: Token and Sequence-Level Reward Shaping with Policy Entropy},
  author  = {Tan, Hongze and Pan, Jianfei},
  journal = {arXiv preprint arXiv:2508.04349},
  year    = {2025}
}

@article{yue2025promoting,
  author  = {Yue, Chuhuai and Dong, Chengqi and Gao, Yinan and He, Hang and Chai, Jiajun and Yin, Guojun and Lin, Wei},
  title   = {Promoting Efficient Reasoning with Verifiable Stepwise Reward},
  journal = {arXiv preprint arXiv:2508.10293},
  year    = {2025},
}

@article{jin2025your,
  title   = {Your Reward Function for {RL} is Your Best {PRM} for Search: Unifying {RL} and Search-Based {TTS}},
  author  = {Jin, Can and Zhou, Yang and Zhang, Qixin and Peng, Hongwu and Zhang, Di and Pavone, Marco and Han, Ligong and Hong, Zhang-Wei and Che, Tong and Metaxas, Dimitris N.},
  journal = {arXiv preprint arXiv:2508.14313},
  year    = {2025}
}

@article{li2025treepo,
  title   = {{TreePO}: Bridging the Gap of Policy Optimization and Efficacy and Inference Efficiency with Heuristic Tree-based Modeling},
  author  = {Li, Yizhi and Gu, Qingshui and Wen, Zhoufutu and Li, Ziniu and Xing, Tianshun and Guo, Shuyue and Zheng, Tianyu and Zhou, Xin and Qu, Xingwei and Zhou, Wangchunshu and Zhang, Zheng and Shen, Wei and Liu, Qian and Lin, Chenghua and Yang, Jian and Zhang, Ge and Huang, Wenhao},
  journal = {arXiv preprint arXiv:2508.17445},
  year    = {2025}
}

@article{ye2025beyond,
  title   = {Beyond Correctness: Harmonizing Process and Outcome Rewards through {RL} Training},
  author  = {Ye, Chenlu and Yu, Zhou and Zhang, Ziji and Chen, Hao and Sadagopan, Narayanan and Huang, Jing and Zhang, Tong and Beniwal, Anurag},
  journal = {arXiv preprint arXiv:2509.03403},
  year    = {2025}
}

@article{wang2025emergent,
  title   = {Emergent Hierarchical Reasoning in {LLMs} through Reinforcement Learning},
  author  = {Wang, Haozhe and Xu, Qixin and Liu, Che and Wu, Junhong and Lin, Fangzhen and Chen, Wenhu},
  journal = {arXiv preprint arXiv:2509.03646},
  year    = {2025}
}

@article{liu2025attention,
  title   = {Attention as a Compass: Efficient Exploration for Process-Supervised {RL} in Reasoning Models},
  author  = {Liu, Runze and Wang, Jiakang and Shi, Yuling and Xie, Zhihui and An, Chenxin and Zhang, Kaiyan and Zhao, Jian and Gu, Xiaodong and Lin, Lei and Hu, Wenping and Li, Xiu and Zhang, Fuzheng and Zhou, Guorui and Gai, Kun},
  journal = {arXiv preprint arXiv:2509.26628},
  year    = {2025}
}

## Reward: Turn Level
@article{sun2025stabilizing,
  author  = {Sun, Zetian and Li, Dongfang and Chen, Zhuoen and Qin, Yuhuai and Hu, Baotian},
  title   = {Stabilizing Long-term Multi-turn Reinforcement Learning with Gated Rewards},
  journal = {arXiv preprint arXiv:2508.10548},
  year    = {2025},
}

@article{zhao2025mua,
  title   = {{MUA-RL}: Multi-turn User-interacting Agent Reinforcement Learning for agentic tool use},
  author  = {Zhao, Weikang and Wang, Xili and Ma, Chengdi and Kong, Lingbin and Yang, Zhaohua and Tuo, Mingxiang and Shi, Xiaowei and Zhai, Yitao and Cai, Xunliang},
  journal = {arXiv preprint arXiv:2508.18669},
  year    = {2025}
}

@article{zhu2024emotion,
  title     = {An Emotion-Sensitive Dialogue Policy for Task-Oriented Dialogue System},
  author    = {Zhu, Hui and Wang, Xv and Wang, Zhenyu and Xv, Kai},
  journal   = {Scientific Reports},
  volume    = {14},
  number    = {1},
  pages     = {19759},
  year      = {2024},
  publisher = {Nature Publishing Group UK London}
}

@article{lee2024improving,
  author  = {Lee, Dong Won and Park, Hae Won and Kim, Yoon and Breazeal, Cynthia and Morency, Louis-Philippe},
  title   = {Improving Dialogue Agents by Decomposing One Global Explicit Annotation with Local Implicit Multimodal Feedback},
  journal = {arXiv preprint arXiv:2403.11330},
  year    = {2024},
}

@article{zeng2025reinforcing,
  title   = {Reinforcing Multi-Turn Reasoning in {LLM} Agents via Turn-Level Credit Assignment},
  author  = {Zeng, Siliang and Wei, Quan and Brown, William and Frunza, Oana and Nevmyvaka, Yuriy and Hong, Mingyi},
  journal = {arXiv preprint arXiv:2505.11821},
  year    = {2025}
}

@article{gu2025mobile,
  title   = {{Mobile-R1}: Towards Interactive Reinforcement Learning for {VLM-Based} Mobile Agent via Task-Level Rewards},
  author  = {Gu, Jihao and Ai, Qihang and Wang, Yingyao and Bu, Pi and Xing, Jingxuan and Zhu, Zekun and Jiang, Wei and Wang, Ziming and Zhao, Yingxiu and Zhang, Ming-Liang and others},
  journal = {arXiv preprint arXiv:2506.20332},
  year    = {2025}
}

@article{zhou2024archer,
  title   = {{ArCHer}: Training Language Model Agents via Hierarchical Multi-Turn {RL}},
  author  = {Zhou, Yifei and Zanette, Andrea and Pan, Jiayi and Levine, Sergey and Kumar, Aviral},
  journal = {arXiv preprint arXiv:2402.19446},
  year    = {2024}
}

@article{lee2025aligning,
  author  = {Lee, Dong Won and Park, Hae Won and Breazeal, Cynthia and Morency, Louis-Philippe},
  title   = {Aligning Dialogue Agents with Global Feedback via Large Language Model Reward Decomposition},
  journal = {arXiv preprint arXiv:2505.15922},
  year    = {2025},
}

## Reward Design - Reward Shaping


## Reward Design - Generative Rewards

@article{kuba2025language,
  title   = {Language Self-Play For Data-Free Training},
  author  = {Kuba, Jakub Grudzien and Gu, Mengting and Ma, Qi and Tian, Yuandong and Mohan, Vijai},
  journal = {arXiv preprint arXiv:2509.07414},
  year    = {2025}
}

@article{khalifa2025process,
  title   = {Process Reward Models That Think},
  author  = {Khalifa, Muhammad and Agarwal, Rishabh and Logeswaran, Lajanugen and Kim, Jaekyeom and Peng, Hao and Lee, Moontae and Lee, Honglak and Wang, Lu},
  journal = {arXiv preprint arXiv:2504.16828},
  year    = {2025}
}

@article{chen2025xverify,
  title   = {{xVerify}: Efficient Answer Verifier for Reasoning Model Evaluations},
  author  = {Chen, Ding and Yu, Qingchen and Wang, Pengyuan and Zhang, Wentao and Tang, Bo and Xiong, Feiyu and Li, Xinchi and Yang, Minchuan and Li, Zhiyu},
  journal = {arXiv preprint arXiv:2504.10481},
  year    = {2025}
}

@article{li2023generative,
  author  = {Li, Junlong and Sun, Shichao and Yuan, Weizhe and Fan, Run-Ze and Zhao, Hai and Liu, Pengfei},
  title   = {Generative Judge for Evaluating Alignment},
  journal = {arXiv preprint arXiv:2310.05470},
  year    = {2023},
}

@article{ankner2024critique,
  title   = {Critique-out-Loud Reward Models},
  author  = {Ankner, Zachary and Paul, Mansheej and Cui, Brandon and Chang, Jonathan D. and Ammanabrolu, Prithviraj},
  journal = {arXiv preprint arXiv:2408.11791},
  year    = {2024}
}

@article{bukharin2025adversarial,
  title   = {Adversarial Training of Reward Models},
  author  = {Bukharin, Alexander and Qian, Haifeng and Sun, Shengyang and Renduchintala, Adithya and Singhal, Soumye and Wang, Zhilin and Kuchaiev, Oleksii and Delalleau, Olivier and Zhao, Tuo},
  journal = {arXiv preprint arXiv:2504.06141},
  year    = {2025}
}

@article{tripathi2025pairwise,
  title   = {Pairwise or Pointwise? Evaluating Feedback Protocols for Bias in {LLM-Based} Evaluation},
  author  = {Tripathi, Tuhina and Wadhwa, Manya and Durrett, Greg and Niekum, Scott},
  journal = {arXiv preprint arXiv:2504.14716},
  year    = {2025}
}

@article{hong2025think,
  title   = {{Think-RM}: Enabling Long-Horizon Reasoning in Generative Reward Models},
  author  = {Hong, Ilgee and Yu, Changlong and Qiu, Liang and Yan, Weixiang and Xu, Zhenghao and Jiang, Haoming and Zhang, Qingru and Lu, Qin and Liu, Xin and Zhang, Chao and others},
  journal = {arXiv preprint arXiv:2505.16265},
  year    = {2025}
}

@article{mahan2024generative,
  title   = {Generative Reward Models},
  author  = {Mahan, Dakota and Van Phung, Duy and Rafailov, Rafael and Blagden, Chase and Lile, Nathan and Castricato, Louis and Fr{\"a}nken, Jan-Philipp and Finn, Chelsea and Albalak, Alon},
  journal = {arXiv preprint arXiv:2410.12832},
  year    = {2024}
}

@article{guo2025reward,
  title   = {Reward Reasoning Model},
  author  = {Guo, Jiaxin and Chi, Zewen and Dong, Li and Dong, Qingxiu and Wu, Xun and Huang, Shaohan and Wei, Furu},
  journal = {arXiv preprint arXiv:2505.14674},
  year    = {2025}
}

@article{alazraki2025reverse,
  author  = {Alazraki, Lisa and Yi-Chern, Tan and Campos, Jon Ander and Mozes, Maximilian and Rei, Marek and Bartolo, Max},
  title   = {Reverse Engineering Human Preferences with Reinforcement Learning},
  journal = {arXiv preprint arXiv:2505.15795},
  year    = {2025},
}

@article{jia2025writing,
  title   = {{Writing-Zero}: Bridge the Gap Between Non-verifiable Tasks and Verifiable Rewards},
  author  = {Jia, Ruipeng and Yang, Yunyi and Gai, Yongbo and Luo, Kai and Huang, Shihao and Lin, Jianhe and Jiang, Xiaoxi and Jiang, Guanjun},
  journal = {arXiv preprint arXiv:2506.00103},
  year    = {2025}
}

@article{bensal2025reflect,
  title   = {Reflect, Retry, Reward: Self-Improving {LLMs} via Reinforcement Learning},
  author  = {Bensal, Shelly and Jamil, Umar and Bryant, Christopher and Russak, Melisa and Kamble, Kiran and Mozolevskyi, Dmytro and Ali, Muayad and AlShikh, Waseem},
  journal = {arXiv preprint arXiv:2505.24726},
  year    = {2025}
}

@article{viswanathan2025checklists,
  title   = {Checklists Are Better Than Reward Models For Aligning Language Models},
  author  = {Viswanathan, Vijay and Sun, Yanchao and Ma, Shuang and Kong, Xiang and Cao, Meng and Neubig, Graham and Wu, Tongshuang},
  journal = {arXiv preprint arXiv:2507.18624},
  year    = {2025}
}

@article{zhou2025libra,
  author  = {Zhou, Meng and Li, Bei and Liu, Jiahao and Shi, Xiaowen and Bai, Yang and Weng, Rongxiang and Wang, Jingang and Cai, Xunliang},
  title   = {Libra: Assessing and Improving Reward Model by Learning to Think},
  journal = {arXiv preprint arXiv:2507.21645},
  year    = {2025},
}

@article{zhang2025distill,
  author  = {Zhang, Yudi and Wang, Lu and Fang, Meng and Du, Yali and Huang, Chenghua and Wang, Jun and Lin, Qingwei and Pechenizkiy, Mykola and Zhang, Dongmei and Rajmohan, Saravan and others},
  title   = {Distill Not Only Data but Also Rewards: Can Smaller Language Models Surpass Larger Ones?},
  journal = {arXiv preprint arXiv:2502.19557},
  year    = {2025},
}

@article{xiong2025self,
  title   = {Self-Rewarding Correction for Mathematical Reasoning},
  author  = {Xiong, Wei and Zhang, Hanning and Ye, Chenlu and Chen, Lichang and Jiang, Nan and Zhang, Tong},
  journal = {arXiv preprint arXiv:2502.19613},
  year    = {2025}
}

@article{chen2025judgelrm,
  title   = {{JudgeLRM}: Large Reasoning Models as a Judge},
  author  = {Chen, Nuo and Hu, Zhiyuan and Zou, Qingyun and Wu, Jiaying and Wang, Qian and Hooi, Bryan and He, Bingsheng},
  journal = {arXiv preprint arXiv:2504.00050},
  year    = {2025}
}

@article{zhu2025charm,
  title   = {{CHARM}: Calibrating Reward Models with {Chatbot Arena} Scores},
  author  = {Zhu, Xiao and Tan, Chenmien and Chen, Pinzhen and Sennrich, Rico and Zhang, Yanlin and Hu, Hanxu},
  journal = {arXiv preprint arXiv:2504.10045},
  year    = {2025}
}

@article{chen2025establishing,
  title   = {Establishing Reliability Metrics for Reward Models in Large Language Models},
  author  = {Chen, Yizhou and Liu, Yawen and Wang, Xuesi and Yu, Qingtao and Huzhang, Guangda and Zeng, Anxiang and Yu, Han and Zhou, Zhiming},
  journal = {arXiv preprint arXiv:2504.14838},
  year    = {2025}
}

@article{yang2025deepcritic,
  title   = {{DeepCritic}: Deliberate Critique with Large Language Models},
  author  = {Yang, Wenkai and Chen, Jingwen and Lin, Yankai and Wen, Ji-Rong},
  journal = {arXiv preprint arXiv:2505.00662},
  year    = {2025}
}

@article{liu2025nover,
  title   = {{NOVER}: Incentive Training for Language Models via Verifier-Free Reinforcement Learning},
  author  = {Liu, Wei and Qi, Siya and Wang, Xinyu and Qian, Chen and Du, Yali and He, Yulan},
  journal = {arXiv preprint arXiv:2505.16022},
  year    = {2025}
}

@article{jiang2025pag,
  title   = {{PAG}: Multi-Turn Reinforced {LLM} Self-Correction with Policy as Generative Verifier},
  author  = {Jiang, Yuhua and Xiong, Yuwen and Yuan, Yufeng and Xin, Chao and Xu, Wenyuan and Yue, Yu and Zhao, Qianchuan and Yan, Lin},
  journal = {arXiv preprint arXiv:2506.10406},
  year    = {2025}
}

@article{xu2025direct,
  title   = {Direct Reasoning Optimization: {LLMs} Can Reward and Refine Their Own Reasoning for Open-Ended Tasks},
  author  = {Xu, Yifei and Chakraborty, Tusher and Sharma, Srinagesh and Nunes, Leonardo and K{\i}c{\i}man, Emre and Lu, Songwu and Chandra, Ranveer},
  journal = {arXiv preprint arXiv:2506.13351},
  year    = {2025}
}

@article{wang2025gram,
  title   = {{GRAM}: A Generative Foundation Reward Model for Reward Generalization},
  author  = {Wang, Chenglong and Gan, Yang and Huo, Yifu and Mu, Yongyu and He, Qiaozhi and Yang, Murun and Li, Bei and Xiao, Tong and Zhang, Chunliang and Liu, Tongran and others},
  journal = {arXiv preprint arXiv:2506.14175},
  year    = {2025}
}

@article{guo2025general,
  title   = {From General to Targeted Rewards: Surpassing {GPT-4} in Open-Ended Long-Context Generation},
  author  = {Guo, Zhihan and Wu, Jiele and Cui, Wenqian and Zhang, Yifei and Hu, Minda and Wang, Yufei and King, Irwin},
  journal = {arXiv preprint arXiv:2506.16024},
  year    = {2025}
}

@article{li2025omni,
  title   = {{Omni-Think}: Scaling Cross-Domain Generalization in {LLMs} via Multi-Task {RL} with Hybrid Rewards},
  author  = {Li, Derek and Zhou, Jiaming and Kazemi, Amirreza and Sun, Qianyi and Ghaddar, Abbas and Alomrani, Mohammad Ali and Ma, Liheng and Luo, Yu and Li, Dong and Wen, Feng and others},
  journal = {arXiv preprint arXiv:2507.14783},
  year    = {2025}
}

@article{gunjal2025rubrics,
  title   = {Rubrics as Rewards: Reinforcement Learning Beyond Verifiable Domains},
  author  = {Gunjal, Anisha and Wang, Anthony and Lau, Elaine and Nath, Vaskar and Liu, Bing and Hendryx, Sean},
  journal = {arXiv preprint arXiv:2507.17746},
  year    = {2025}
}

@article{fei2025post,
  author  = {Fei, Xiang and Wang, Siqi and Wei, Shu and Nie, Yuxiang and Shi, Wei and Feng, Hao and Huang, Can},
  title   = {Post-Completion Learning for Language Models},
  journal = {arXiv preprint arXiv:2507.20252},
  year    = {2025},
}

@article{xie2025capo,
  title   = {{CAPO}: Towards Enhancing {LLM} Reasoning through Verifiable Generative Credit Assignment},
  author  = {Xie, Guofu and Shi, Yunsheng and Tian, Hongtao and Yao, Ting and Zhang, Xiao},
  journal = {arXiv preprint arXiv:2508.02298},
  year    = {2025}
}

@article{liu2025compassverifier,
  title   = {{CompassVerifier}: A Unified and Robust Verifier for {LLMs} Evaluation and Outcome Reward},
  author  = {Liu, Shudong and Liu, Hongwei and Liu, Junnan and Xiao, Linchen and Gao, Songyang and Lyu, Chengqi and Gu, Yuzhe and Zhang, Wenwei and Wong, Derek F. and Zhang, Songyang and others},
  journal = {arXiv preprint arXiv:2508.03686},
  year    = {2025}
}

@article{hong2025cooper,
  author  = {Hong, Haitao and Yan, Yuchen and Wu, Xingyu and Hou, Guiyang and Zhang, Wenqi and Lu, Weiming and Shen, Yongliang and Xiao, Jun},
  title   = {Cooper: Co-Optimizing Policy and Reward Models in Reinforcement Learning for Large Language Models},
  journal = {arXiv preprint arXiv:2508.05613},
  year    = {2025},
}

@article{salemi2025learning,
  author  = {Salemi, Alireza and Zamani, Hamed},
  title   = {Learning from Natural Language Feedback for Personalized Question Answering},
  journal = {arXiv preprint arXiv:2508.10695},
  year    = {2025},
}

@article{li2025generalist,
  author  = {Li, Yi-Chen and Xu, Tian and Yu, Yang and Zhang, Xuqin and Chen, Xiong-Hui and Ling, Zhongxiang and Chao, Ningjing and Yuan, Lei and Zhou, Zhi-Hua},
  title   = {Generalist Reward Models: Found Inside Large Language Models},
  journal = {arXiv preprint arXiv:2506.23235},
  year    = {2025},
}

@article{ma2025general,
  title   = {{General-Reasoner}: Advancing {LLM} Reasoning Across All Domains},
  author  = {Ma, Xueguang and Liu, Qian and Jiang, Dongfu and Zhang, Ge and Ma, Zejun and Chen, Wenhu},
  journal = {arXiv preprint arXiv:2505.14652},
  year    = {2025}
}

@article{whitehouse2025j1,
  title   = {{J1}: Incentivizing thinking in {LLM}-as-a-judge via reinforcement learning},
  author  = {Whitehouse, Chenxi and Wang, Tianlu and Yu, Ping and Li, Xian and Weston, Jason and Kulikov, Ilia and Saha, Swarnadeep},
  journal = {arXiv preprint arXiv:2505.10320},
  year    = {2025}
}

@article{zhou2025breaking,
  title   = {Breaking the Exploration Bottleneck: Rubric-Scaffolded Reinforcement Learning for General {LLM} Reasoning},
  author  = {Zhou, Yang and Li, Sunzhu and Liu, Shunyu and Fang, Wenkai and Zhao, Jiale and Yang, Jingwen and Lv, Jianwei and Zhang, Kongcheng and Zhou, Yihe and Lu, Hengtong and others},
  journal = {arXiv preprint arXiv:2508.16949},
  year    = {2025}
}

@article{ning2025better,
  author  = {Ning, Meiling and Zhang, Zhongbao and Ye, Junda and Guo, Jiabao and Guan, Qingyuan},
  title   = {Better Language Model-Based Judging Reward Modeling through Scaling Comprehension Boundaries},
  journal = {arXiv preprint arXiv:2508.18212},
  year    = {2025},
}

@article{huang2025reinforcement,
  author  = {Huang, Zenan and Zhuang, Yihong and Lu, Guoshan and Qin, Zeyu and Xu, Haokai and Zhao, Tianyu and Peng, Ru and Hu, Jiaqi and Shen, Zhanming and Hu, Xiaomeng and others},
  title   = {Reinforcement Learning with Rubric Anchors},
  journal = {arXiv preprint arXiv:2508.12790},
  year    = {2025},
}

@article{liao2025rlmr,
  title   = {{RLMR}: Reinforcement Learning with Mixed Rewards for Creative Writing},
  author  = {Liao, Jianxing and Zhang, Tian and Feng, Xiao and Zhang, Yusong and Yang, Rui and Wang, Haorui and Wen, Bosi and Wang, Ziying and Shi, Runzhi},
  journal = {arXiv preprint arXiv:2508.18642},
  year    = {2025}
}

@article{dou2025pre,
  author  = {Dou, Shihan and Liu, Shichun and Yang, Yuming and Zou, Yicheng and Zhou, Yunhua and Xing, Shuhao and Huang, Chenhao and Ge, Qiming and Song, Demin and Lv, Haijun and others},
  title   = {Pre-Trained Policy Discriminators are General Reward Models},
  journal = {arXiv preprint arXiv:2507.05197},
  year    = {2025},
}

@article{xu2025tinyv,
  title   = {{TinyV}: Reducing False Negatives in Verification Improves {RL} for {LLM} Reasoning},
  author  = {Xu, Zhangchen and Li, Yuetai and Jiang, Fengqing and Ramasubramanian, Bhaskar and Niu, Luyao and Lin, Bill Yuchen and Poovendran, Radha},
  journal = {arXiv preprint arXiv:2505.14625},
  year    = {2025}
}

@article{zha2025rl,
  title   = {{RL} {Tango}: Reinforcing Generator and Verifier Together for Language Reasoning},
  author  = {Zha, Kaiwen and Gao, Zhengqi and Shen, Maohao and Hong, Zhang-Wei and Boning, Duane S and Katabi, Dina},
  journal = {arXiv preprint arXiv:2505.15034},
  year    = {2025}
}

@article{zhang2025critique,
  title   = {{Critique-GRPO}: Advancing {LLM} reasoning with natural language and numerical feedback},
  author  = {Zhang, Xiaoying and Sun, Hao and Zhang, Yipeng and Feng, Kaituo and Lu, Chaochao and Yang, Chao and Meng, Helen},
  journal = {arXiv preprint arXiv:2506.03106},
  year    = {2025}
}

@article{zheng2023judging,
  title   = {Judging {LLM}-as-a-judge with {MT-Bench} and {Chatbot Arena}},
  author  = {Zheng, Lianmin and Chiang, Wei-Lin and Sheng, Ying and Zhuang, Siyuan and Wu, Zhanghao and Zhuang, Yonghao and Lin, Zi and Li, Zhuohan and Li, Dacheng and Xing, Eric and others},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {36},
  pages   = {46595--46623},
  year    = {2023}
}

@article{zhang2024generative,
  author  = {Zhang, Lunjun and Hosseini, Arian and Bansal, Hritik and Kazemi, Mehran and Kumar, Aviral and Agarwal, Rishabh},
  title   = {Generative verifiers: Reward modeling as next-token prediction},
  journal = {arXiv preprint arXiv:2408.15240},
  year    = {2024},
}

@article{liu2025inference,
  author  = {{Liu}, Zijun and Wang, Peiyi and Xu, Runxin and Ma, Shirong and Ruan, Chong and Li, Peng and Liu, Yang and Wu, Yu},
  title   = {Inference-time scaling for generalist reward modeling},
  journal = {arXiv preprint arXiv:2504.02495},
  year    = {2025},
}

@article{seed2025seed1,
  title   = {{Seed1.5-Thinking}: Advancing superb reasoning models with reinforcement learning},
  author  = {{ByteDance Seed} and Chen, Jiaze and Fan, Tiantian and Liu, Xin and Liu, Lingjun and Lin, Zhiqi and Wang, Mingxuan and Wang, Chengyi and Wei, Xiangpeng and Xu, Wenyuan and others},
  journal = {arXiv preprint arXiv:2504.13914},
  year    = {2025}
}

@article{zeng2025reviewrl,
  title   = {{ReviewRL}: Towards Automated Scientific Review with {RL}},
  author  = {Zeng, Sihang and Tian, Kai and Zhang, Kaiyan and Gao, Junqi and Liu, Runze and Yang, Sa and Li, Jingxuan and Long, Xinwei and Ma, Jiaheng and Qi, Biqing and others},
  journal = {arXiv preprint arXiv:2508.10308},
  year    = {2025}
}

@article{he2025air,
  title   = {{AIR}: A Systematic Analysis of Annotations, Instructions, and Response Pairs in Preference Dataset},
  author  = {He, Bingxiang and Zhang, Wenbin and Song, Jiaxi and Qian, Cheng and Fu, Zixuan and Sun, Bowen and Ding, Ning and Hong, Haiwen and Huang, Longtao and Xue, Hui and others},
  journal = {arXiv preprint arXiv:2504.03612},
  year    = {2025}
}

@article{zhao2025genprm,
  title   = {{GenPRM}: Scaling Test-Time Compute of Process Reward Models via Generative Reasoning},
  author  = {Zhao, Jian and Liu, Runze and Zhang, Kaiyan and Zhou, Zhimu and Gao, Junqi and Li, Dong and Lyu, Jiafei and Qian, Zhouyi and Qi, Biqing and Li, Xiu and others},
  journal = {arXiv preprint arXiv:2504.00891},
  year    = {2025}
}

@article{xu2025unified,
  title   = {A Unified Pairwise Framework for {RLHF}: Bridging Generative Reward Modeling and Policy Optimization},
  author  = {Xu, Wenyuan and Zuo, Xiaochen and Xin, Chao and Yue, Yu and Yan, Lin and Wu, Yonghui},
  journal = {arXiv preprint arXiv:2504.04950},
  year    = {2025}
}

@article{chen2025rm,
  title   = {{RM-R1}: Reward modeling as reasoning},
  author  = {Chen, Xiusi and Li, Gaotang and Wang, Ziqi and Jin, Bowen and Qian, Cheng and Wang, Yu and Wang, Hongru and Zhang, Yu and Zhang, Denghui and Zhang, Tong and others},
  journal = {arXiv preprint arXiv:2505.02387},
  year    = {2025}
}

@article{lu2025urpo,
  title   = {{URPO}: A Unified Reward \& Policy Optimization Framework for Large Language Models},
  author  = {Lu, Songshuo and Wang, Hua and Chen, Zhi and Tang, Yaohua},
  journal = {arXiv preprint arXiv:2507.17515},
  year    = {2025}
}

## Policy Optimal - Critic-based RL

@article{zhu2025vrpo,
  title   = {{VRPO}: Rethinking Value Modeling for Robust {RL} Training under Noisy Supervision},
  author  = {Zhu, Dingwei and Dou, Shihan and Xi, Zhiheng and Jin, Senjie and Zhang, Guoqiang and Zhang, Jiazheng and Ye, Junjie and Chai, Mingxu and Zhou, Enyu and Zhang, Ming and others},
  journal = {arXiv preprint arXiv:2508.03058},
  year    = {2025}
}


@inproceedings{schulman2015trust,
  title        = {Trust region policy optimization},
  author       = {Schulman, John and Levine, Sergey and Abbeel, Pieter and Jordan, Michael and Moritz, Philipp},
  booktitle    = {International Conference on Machine Learning},
  pages        = {1889--1897},
  year         = {2015},
  organization = {PMLR}
}
@article{christiano2017deep,
  title   = {Deep reinforcement learning from human preferences},
  author  = {Christiano, Paul F and Leike, Jan and Brown, Tom and Martic, Miljan and Legg, Shane and Amodei, Dario},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {30},
  year    = {2017}
}

@article{guo2024direct,
  title   = {Direct language model alignment from online {AI} feedback},
  author  = {Guo, Shangmin and Zhang, Biao and Liu, Tianlin and Liu, Tianqi and Khalman, Misha and Llinares, Felipe and Rame, Alexandre and Mesnard, Thomas and Zhao, Yao and Piot, Bilal and others},
  journal = {arXiv preprint arXiv:2402.04792},
  year    = {2024}
}

@article{richemond2024offline,
  author  = {Richemond, Pierre Harvey and Tang, Yunhao and Guo, Daniel and Calandriello, Daniele and Azar, Mohammad Gheshlaghi and Rafailov, Rafael and Pires, Bernardo Avila and Tarassov, Eugene and Spangher, Lucas and Ellsworth, Will and others},
  title   = {Offline regularised reinforcement learning for large language models alignment},
  journal = {arXiv preprint arXiv:2405.19107},
  year    = {2024},
}
@article{yue2025vapo,
  title   = {{VAPO}: Efficient and reliable reinforcement learning for advanced reasoning tasks},
  author  = {Yue, Yu and Yuan, Yufeng and Yu, Qiying and Zuo, Xiaochen and Zhu, Ruofei and Xu, Wenyuan and Chen, Jiaze and Wang, Chengyi and Fan, TianTian and Du, Zhengyin and others},
  journal = {arXiv preprint arXiv:2504.05118},
  year    = {2025}
}


@article{lu2024autopsv,
  title   = {{AutoPSV}: Automated process-supervised verifier},
  author  = {Lu, Jianqiao and Dou, Zhiyang and Wang, Hongru and Cao, Zeyu and Dai, Jianbo and Feng, Yunlong and Guo, Zhijiang},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {37},
  pages   = {79935--79962},
  year    = {2024}
}

@article{yuan2025s,
  title   = {What's Behind {PPO}'s Collapse in {Long-CoT}? {Value} Optimization Holds the Secret},
  author  = {Yuan, Yufeng and Yue, Yu and Zhu, Ruofei and Fan, Tiantian and Yan, Lin},
  journal = {arXiv preprint arXiv:2503.01491},
  year    = {2025}
}

@article{schulman2015high,
  author  = {Schulman, John and Moritz, Philipp and Levine, Sergey and Jordan, Michael and Abbeel, Pieter},
  title   = {High-dimensional continuous control using generalized advantage estimation},
  journal = {arXiv preprint arXiv:1506.02438},
  year    = {2015},
}

@article{hu2025open,
  title   = {{Open-Reasoner-Zero}: An open source approach to scaling up reinforcement learning on the base model},
  author  = {Hu, Jingcheng and Zhang, Yinmin and Han, Qi and Jiang, Daxin and Zhang, Xiangyu and Shum, Heung-Yeung},
  journal = {arXiv preprint arXiv:2503.24290},
  year    = {2025}
}


@article{stiennon2020learning,
  title   = {Learning to summarize with human feedback},
  author  = {Stiennon, Nisan and Ouyang, Long and Wu, Jeffrey and Ziegler, Daniel and Lowe, Ryan and Voss, Chelsea and Radford, Alec and Amodei, Dario and Christiano, Paul F},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {33},
  pages   = {3008--3021},
  year    = {2020}
}

@article{bai2022training,
  author  = {Bai, Yuntao and Jones, Andy and Ndousse, Kamal and Askell, Amanda and Chen, Anna and DasSarma, Nova and Drain, Dawn and Fort, Stanislav and Ganguli, Deep and Henighan, Tom and others},
  title   = {Training a helpful and harmless assistant with reinforcement learning from human feedback},
  journal = {arXiv preprint arXiv:2204.05862},
  year    = {2022},
}


## Policy Optimal - Critic-free RL

@article{liu2025cpgd,
  title   = {{CPGD}: Toward Stable Rule-based Reinforcement Learning for Language Models},
  author  = {{Liu}, Zongkai and Meng, Fanqing and Du, Lingxiao and Zhou, Zhixiang and Yu, Chao and Shao, Wenqi and Zhang, Qiaosheng},
  journal = {arXiv preprint arXiv:2505.12504},
  year    = {2025}
}

@article{liu2025part,
  title   = {Part {I}: Tricks or Traps? {A} Deep Dive into {RL} for {LLM} Reasoning},
  author  = {{Liu}, Zihe and Liu, Jiashun and He, Yancheng and Wang, Weixun and Liu, Jiaheng and Pan, Ling and Hu, Xinyu and Xiong, Shaopan and Huang, Ju and Hu, Jian and others},
  journal = {arXiv preprint arXiv:2508.08221},
  year    = {2025}
}

@article{ahmadian2024back,
  title   = {Back to basics: Revisiting {REINFORCE} style optimization for learning from human feedback in {LLMs}},
  author  = {Ahmadian, Arash and Cremer, Chris and Gall{\'e}, Matthias and Fadaee, Marzieh and Kreutzer, Julia and Pietquin, Olivier and {\"U}st{\"u}n, Ahmet and Hooker, Sara},
  journal = {arXiv preprint arXiv:2402.14740},
  year    = {2024}
}

@article{li2023remax,
  title   = {{ReMax}: A simple, effective, and efficient reinforcement learning method for aligning large language models},
  author  = {Li, Ziniu and Xu, Tian and Zhang, Yushun and Lin, Zhihang and Yu, Yang and Sun, Ruoyu and Luo, Zhi-Quan},
  journal = {arXiv preprint arXiv:2310.10505},
  year    = {2023}
}

@article{williams1992simple,
  title     = {Simple statistical gradient-following algorithms for connectionist reinforcement learning},
  author    = {Williams, Ronald J},
  journal   = {Machine Learning},
  volume    = {8},
  number    = {3},
  pages     = {229--256},
  year      = {1992},
  publisher = {Springer}
}

@article{hu2025reinforce++,
  title   = {{REINFORCE++}: A simple and efficient approach for aligning large language models},
  author  = {Hu, Jian},
  journal = {arXiv preprint arXiv:2501.03262},
  year    = {2025}
}

@article{roux2025tapered,
  title   = {Tapered off-policy {REINFORCE}: Stable and efficient reinforcement learning for {LLMs}},
  author  = {Le Roux, Nicolas and Bellemare, Marc G and Lebensold, Jonathan and Bergeron, Arnaud and Greaves, Joshua and Fr{\'e}chette, Alex and Pelletier, Carolyne and Thibodeau-Laufer, Eric and Toth, S{\'a}ndor and Work, Sam},
  journal = {arXiv preprint arXiv:2503.14286},
  year    = {2025}
}

@article{zhu2025surprising,
  title   = {The surprising effectiveness of negative reinforcement in {LLM} reasoning},
  author  = {Zhu, Xinyu and Xia, Mengzhou and Wei, Zhepei and Chen, Wei-Lin and Chen, Danqi and Meng, Yu},
  journal = {arXiv preprint arXiv:2506.01347},
  year    = {2025}
}

@article{chen2025minimax,
  title   = {{MiniMax-M1}: Scaling Test-Time Compute Efficiently with Lightning Attention},
  author  = {Chen, Aili and Li, Aonian and Gong, Bangwei and Jiang, Binyang and Fei, Bo and Yang, Bo and Shan, Boji and Yu, Changqing and Wang, Chao and Zhu, Cheng and others},
  journal = {arXiv preprint arXiv:2506.13585},
  year    = {2025}
}

@article{zheng2025group,
  author  = {Zheng, Chujie and Liu, Shixuan and Li, Mingze and Chen, Xiong-Hui and Yu, Bowen and Gao, Chang and Dang, Kai and Liu, Yuqiong and Men, Rui and Yang, An and others},
  title   = {Group Sequence Policy Optimization},
  journal = {arXiv preprint arXiv:2507.18071},
  year    = {2025},
}

## Policy Optimal - Regular (yuchen)
@article{williams1991function,
  author    = {Williams, Ronald J and Peng, Jing},
  title     = {Function optimization using connectionist reinforcement learning algorithms},
  journal   = {Connection Science},
  volume    = {3},
  number    = {3},
  pages     = {241--268},
  year      = {1991},
  publisher = {Taylor \& Francis},
}

@article{eysenbach2021maximum,
  title   = {Maximum entropy {RL} (provably) solves some robust {RL} problems},
  author  = {Eysenbach, Benjamin and Levine, Sergey},
  journal = {arXiv preprint arXiv:2103.06257},
  year    = {2021}
}

@article{lyu2025exploring,
  author  = {Lyu, Chengqi and Gao, Songyang and Gu, Yuzhe and Zhang, Wenwei and Gao, Jianfei and Liu, Kuikun and Wang, Ziyi and Li, Shuaibin and Zhao, Qian and Huang, Haian and others},
  title   = {Exploring the limit of outcome reward for learning mathematical reasoning},
  journal = {arXiv preprint arXiv:2502.06781},
  year    = {2025},
}

@article{liu2025prorl,
  title   = {{ProRL}: Prolonged reinforcement learning expands reasoning boundaries in large language models},
  author  = {{Liu}, Mingjie and Diao, Shizhe and Lu, Ximing and Hu, Jian and Dong, Xin and Choi, Yejin and Kautz, Jan and Dong, Yi},
  journal = {arXiv preprint arXiv:2505.24864},
  year    = {2025}
}

@article{liu2025scaling,
  title   = {Scaling Up {RL}: Unlocking Diverse Reasoning in {LLMs} via Prolonged Training},
  author  = {{Liu}, Mingjie and Diao, Shizhe and Hu, Jian and Lu, Ximing and Dong, Xin and Zhang, Hao and Bukharin, Alexander and Zhang, Shaokun and Zeng, Jiaqi and Sreedhar, Makesh Narsimhan and others},
  journal = {arXiv preprint arXiv:2507.12507},
  year    = {2025}
}

@inproceedings{ZiebartMBD08,
  author    = {Ziebart, Brian D. and Maas, Andrew L. and Bagnell, J. Andrew and Dey, Anind K.},
  editor    = {Fox, Dieter and Gomes, Carla P.},
  title     = {Maximum Entropy Inverse Reinforcement Learning},
  booktitle = {Proceedings of the Twenty-Third {AAAI} Conference on Artificial Intelligence, {AAAI} 2008, Chicago, Illinois, USA, July 13-17, 2008},
  pages     = {1433--1438},
  publisher = {{AAAI} Press},
  year      = {2008},
  url       = {http://www.aaai.org/Library/AAAI/2008/aaai08-227.php},
}

@article{schulman2017equivalence,
  title   = {Equivalence between policy gradients and soft {Q-learning}},
  author  = {Schulman, John and Chen, Xi and Abbeel, Pieter},
  journal = {arXiv preprint arXiv:1704.06440},
  year    = {2017}
}

@article{schulman2017proximal,
  author  = {Schulman, John and Wolski, Filip and Dhariwal, Prafulla and Radford, Alec and Klimov, Oleg},
  title   = {Proximal policy optimization algorithms},
  journal = {arXiv preprint arXiv:1707.06347},
  year    = {2017},
}

@inproceedings{haarnoja2018soft,
  title        = {Soft actor-critic: Off-policy maximum entropy deep reinforcement learning with a stochastic actor},
  author       = {Haarnoja, Tuomas and Zhou, Aurick and Abbeel, Pieter and Levine, Sergey},
  booktitle    = {International Conference on Machine Learning},
  pages        = {1861--1870},
  year         = {2018},
  organization = {PMLR}
}


@article{touvron2023llama,
  author  = {Touvron, Hugo and Martin, Louis and Stone, Kevin and Albert, Peter and Almahairi, Amjad and Babaei, Yasmine and Bashlykov, Nikolay and Batra, Soumya and Bhargava, Prajjwal and Bhosale, Shruti and others},
  title   = {Llama 2: Open foundation and fine-tuned chat models},
  journal = {arXiv preprint arXiv:2307.09288},
  year    = {2023},
}


@article{zeng2025simplerl,
  title   = {{SimpleRL-Zoo}: Investigating and taming zero reinforcement learning for open base models in the wild},
  author  = {Zeng, Weihao and Huang, Yuzhen and Liu, Qian and Liu, Wei and He, Keqing and Ma, Zejun and He, Junxian},
  journal = {arXiv preprint arXiv:2503.18892},
  year    = {2025}
}

@article{liu2025understanding,
  title   = {Understanding {R1-Zero}-like training: A critical perspective},
  author  = {Liu, Zichen and Chen, Changyu and Li, Wenjun and Qi, Penghui and Pang, Tianyu and Du, Chao and Lee, Wee Sun and Lin, Min},
  journal = {arXiv preprint arXiv:2503.20783},
  year    = {2025}
}

@article{yan2025learning,
  author  = {Yan, Jianhao and Li, Yafu and Hu, Zican and Wang, Zhi and Cui, Ganqu and Qu, Xiaoye and Cheng, Yu and Zhang, Yue},
  title   = {Learning to reason under off-policy guidance},
  journal = {arXiv preprint arXiv:2504.14945},
  year    = {2025},
}


@article{abdin2025phi,
  author  = {Abdin, Marah and Agarwal, Sahaj and Awadallah, Ahmed and Balachandran, Vidhisha and Behl, Harkirat and Chen, Lingjiao and de Rosa, Gustavo and Gunasekar, Suriya and Javaheripi, Mojan and Joshi, Neel and others},
  title   = {Phi-4-reasoning technical report},
  journal = {arXiv preprint arXiv:2504.21318},
  year    = {2025},
}

@article{mukherjee2025reinforcement,
  author  = {Mukherjee, Sagnik and Yuan, Lifan and Hakkani-Tur, Dilek and Peng, Hao},
  title   = {Reinforcement Learning Finetunes Small Subnetworks in Large Language Models},
  journal = {arXiv preprint arXiv:2505.11711},
  year    = {2025},
}

@article{fu2025areal,
  title   = {{AReaL}: A Large-Scale Asynchronous Reinforcement Learning System for Language Reasoning},
  author  = {Fu, Wei and Gao, Jiaxuan and Shen, Xujie and Zhu, Chen and Mei, Zhiyu and He, Chuyi and Xu, Shusheng and Wei, Guo and Mei, Jun and Wang, Jiashu and others},
  journal = {arXiv preprint arXiv:2505.24298},
  year    = {2025}
}

@article{guo2025synthetic,
  title   = {Synthetic Data {RL}: Task Definition Is All You Need},
  author  = {Guo, Yiduo and Guo, Zhen and Huang, Chuanwei and Wang, Zi-Ang and Zhang, Zekai and Yu, Haofei and Zhang, Huishuai and Shen, Yikang},
  journal = {arXiv preprint arXiv:2505.17063},
  year    = {2025}
}

@article{yang2025not,
  title   = {Do Not Let Low-Probability Tokens Over-Dominate in {RL} for {LLMs}},
  author  = {Yang, Zhihe and Luo, Xufang and Wang, Zilong and Han, Dongqi and He, Zhiyuan and Li, Dongsheng and Xu, Yunjian},
  journal = {arXiv preprint arXiv:2505.12929},
  year    = {2025}
}


@article{gao2025one,
  author  = {Gao, Zitian and Chen, Lynx and Zhou, Joey and Dai, Bryan},
  title   = {One-shot Entropy Minimization},
  journal = {arXiv preprint arXiv:2505.20282},
  year    = {2025},
}


@article{wang2025beyond,
  title   = {Beyond the 80/20 rule: High-entropy minority tokens drive effective reinforcement learning for {LLM} reasoning},
  author  = {Wang, Shenzhi and Yu, Le and Gao, Chang and Zheng, Chujie and Liu, Shixuan and Lu, Rui and Dang, Kai and Chen, Xionghui and Yang, Jianxin and Zhang, Zhenru and others},
  journal = {arXiv preprint arXiv:2506.01939},
  year    = {2025}
}


@article{wang2025stabilizing,
  title   = {Stabilizing Knowledge, Promoting Reasoning: Dual-Token Constraints for {RLVR}},
  author  = {Wang, Jiakang and Liu, Runze and Zhang, Fuzheng and Li, Xiu and Zhou, Guorui},
  journal = {arXiv preprint arXiv:2507.15778},
  year    = {2025}
}

@article{ASPO,
  title   = {{ASPO}: Asymmetric Importance Sampling Policy Optimization},
  author  = {Wang, Jiakang and Liu, Runze and Lin, Lei and Hu, Wenping and Li, Xiu and Zhang, Fuzheng and Zhou, Guorui and Gai, Kun},
  journal = {arXiv preprint arXiv:2510.06062},
  year    = {2025}
}

@article{xiang2025just,
  author  = {Xiang, Violet and Blagden, Chase and Rafailov, Rafael and Lile, Nathan and Truong, Sang and Finn, Chelsea and Haber, Nick},
  title   = {Just Enough Thinking: Efficient Reasoning with Adaptive Length Penalties Reinforcement Learning},
  journal = {arXiv preprint arXiv:2506.05256},
  year    = {2025},
}


@article{su2025between,
  title   = {Between underthinking and overthinking: An empirical study of reasoning length and correctness in {LLMs}},
  author  = {Su, Jinyan and Healey, Jennifer and Nakov, Preslav and Cardie, Claire},
  journal = {arXiv preprint arXiv:2505.00127},
  year    = {2025}
}


@article{aggarwal2025l1,
  author  = {Aggarwal, Pranjal and Welleck, Sean},
  title   = {L1: Controlling how long a reasoning model thinks with reinforcement learning},
  journal = {arXiv preprint arXiv:2503.04697},
  year    = {2025},
}

@article{luo2025o1,
  title   = {{O1-Pruner}: Length-harmonizing fine-tuning for {O1}-like reasoning pruning},
  author  = {Luo, Haotian and Shen, Li and He, Haiying and Wang, Yibo and Liu, Shiwei and Li, Wei and Tan, Naiqiang and Cao, Xiaochun and Tao, Dacheng},
  journal = {arXiv preprint arXiv:2501.12570},
  year    = {2025}
}

@article{yuan2025efficient,
  title   = {Efficient {RL} Training for Reasoning Models via Length-Aware Optimization},
  author  = {Yuan, Danlong and Xie, Tian and Huang, Shaohan and Gong, Zhuocheng and Zhang, Huishuai and Luo, Chong and Wei, Furu and Zhao, Dongyan},
  journal = {arXiv preprint arXiv:2505.12284},
  year    = {2025}
}


## Policy Optimal - Exploration


## Rollout and Sampling - Curriculum Learning


## Rollout and Sampling - Structured Sampling


## Rollout and Sampling - Hyper-parameter Resets

@article{wu2025confucius3,
  title   = {{Confucius3-Math}: A Lightweight High-Performance Reasoning {LLM} for {Chinese} {K-12} Mathematics Learning},
  author  = {Wu, Lixin and Cai, Na and Cheng, Qiao and Wang, Jiachen and Duan, Yitao},
  journal = {arXiv preprint arXiv:2506.18330},
  year    = {2025}
}


@article{fan2025truncated,
  author  = {Fan, Tiantian and Liu, Lingjun and Yue, Yu and Chen, Jiaze and Wang, Chengyi and Yu, Qiying and Zhang, Chi and Lin, Zhiqi and Zhu, Ruofei and Yuan, Yufeng and others},
  title   = {Truncated Proximal Policy Optimization},
  journal = {arXiv preprint arXiv:2506.15050},
  year    = {2025},
}

@article{li2025output,
  title   = {Output Length Effect on {DeepSeek-R1}'s Safety in Forced Thinking},
  author  = {Li, Xuying and Li, Zhuo and Kosuga, Yuji and Bian, Victor},
  journal = {arXiv preprint arXiv:2503.01923},
  year    = {2025}
}

@article{liao2025enhancing,
  title   = {Enhancing Efficiency and Exploration in Reinforcement Learning for {LLMs}},
  author  = {Liao, Mengqi and Xi, Xiangyu and Chen, Ruinian and Leng, Jia and Hu, Yangen and Zeng, Ke and Liu, Shuai and Wan, Huaiyu},
  journal = {arXiv preprint arXiv:2505.18573},
  year    = {2025}
}

@article{shrivastava2025sample,
  author  = {Shrivastava, Vaishnavi and Awadallah, Ahmed and Balachandran, Vidhisha and Garg, Shivam and Behl, Harkirat and Papailiopoulos, Dimitris},
  title   = {Sample More to Think Less: Group Filtered Policy Optimization for Concise Reasoning},
  journal = {arXiv preprint arXiv:2508.09726},
  year    = {2025},
}

@article{arora2025training,
  author  = {Arora, Daman and Zanette, Andrea},
  title   = {Training language models to reason efficiently},
  journal = {arXiv preprint arXiv:2502.04463},
  year    = {2025},
}

@article{yu2025dapo,
  title   = {{DAPO}: An open-source {LLM} reinforcement learning system at scale},
  author  = {Yu, Qiying and Zhang, Zheng and Zhu, Ruofei and Yuan, Yufeng and Zuo, Xiaochen and Yue, Yu and Dai, Weinan and Fan, Tiantian and Liu, Gaohong and Liu, Lingjun and others},
  journal = {arXiv preprint arXiv:2503.14476},
  year    = {2025}
}

@article{liu2025acereason,
  title   = {{AceReason-Nemotron} 1.1: Advancing Math and Code Reasoning through {SFT} and {RL} Synergy},
  author  = {Liu, Zihan and Yang, Zhuolin and Chen, Yang and Lee, Chankyu and Shoeybi, Mohammad and Catanzaro, Bryan and Ping, Wei},
  journal = {arXiv preprint arXiv:2506.13284},
  year    = {2025}
}

@misc{an2025polaris,
  title  = {{POLARIS}: A Post-Training Recipe for Scaling Reinforcement Learning on Advanced Reasoning Models},
  author = {An, Chenxin and Xie, Zhihui and Li, Xiaonan and Li, Lei and Zhang, Jun and Gong, Shansan and Zhong, Ming and Xu, Jingjing and Qiu, Xipeng and Wang, Mingxuan and Kong, Lingpeng},
  year   = {2025},
  url    = {https://hkunlp.github.io/blog/2025/Polaris}
}

## Rollout and Sampling - Latent Reasoning RL


# RL Infra
## Open-Source RL


## Multi-Agent


## External data for off-policy
@article{wang2024offline,
  title   = {Offline reinforcement learning for {LLM} multi-step reasoning},
  author  = {Wang, Huaijie and Hao, Shibo and Dong, Hanze and Zhang, Shenao and Bao, Yilin and Yang, Ziran and Wu, Yi},
  journal = {arXiv preprint arXiv:2412.16145},
  year    = {2024}
}

@article{tang2025rl,
  title   = {{RL}-finetuning {LLMs} from on- and off-policy data with a single algorithm},
  author  = {Tang, Yunhao and Cohen, Taco and Zhang, David W and Valko, Michal and Munos, R{\'e}mi},
  journal = {arXiv preprint arXiv:2503.19612},
  year    = {2025}
}

@inproceedings{baheti2024leftover,
  title     = {Leftover Lunch: Advantage-based Offline Reinforcement Learning for Language Models},
  author    = {Baheti, Ashutosh and Lu, Ximing and Brahman, Faeze and Le Bras, Ronan and Sap, Maarten and Riedl, Mark O},
  booktitle = {International Conference on Learning Representations},
  year      = {2024}
}

@article{gulcehre2023reinforced,
  title   = {Reinforced self-training ({ReST}) for language modeling},
  author  = {Gulcehre, Caglar and Paine, Tom Le and Srinivasan, Srivatsan and Konyushkova, Ksenia and Weerts, Lotte and Sharma, Abhishek and Siddhant, Aditya and Ahern, Alex and Wang, Miaosen and Gu, Chenjie and others},
  journal = {arXiv preprint arXiv:2308.08998},
  year    = {2023}
}

@article{pouplin2024synergy,
  title   = {The Synergy of {LLMs} \& {RL} Unlocks Offline Learning of Generalizable Language-Conditioned Policies with Low-fidelity Data},
  author  = {Pouplin, Thomas and Kobalczyk, Katarzyna and Sun, Hao and van der Schaar, Mihaela},
  journal = {arXiv preprint arXiv:2412.06877},
  year    = {2024}
}



## Experience replay
@article{wang2025eframe,
  title   = {{EFRame}: Deeper Reasoning via Exploration-Filtering-Replay Reinforcement Learning Framework},
  author  = {Wang, Chen and Wei, Lai and Zhang, Yanzhi and Shao, Chenyang and Dan, Zedong and Huang, Weiran and Wang, Yue and Zhang, Yuzhi},
  journal = {arXiv preprint arXiv:2506.22200},
  year    = {2025}
}

@article{chen2024enhancing,
  title   = {Enhancing {LLM} Agents for Code Generation with Possibility and Pass-rate Prioritized Experience Replay},
  author  = {Chen, Yuyang and Zhao, Kaiyan and Wang, Yiming and Yang, Ming and Zhang, Jian and Niu, Xiaoguang},
  journal = {arXiv preprint arXiv:2410.12236},
  year    = {2024}
}

@article{feng2025get,
  title   = {Get Experience from Practice: {LLM} Agents with Record \& Replay},
  author  = {Feng, Erhu and Zhou, Wenbo and Liu, Zibin and Chen, Le and Dong, Yunpeng and Zhang, Cheng and Zhao, Yisheng and Du, Dong and Hua, Zhichao and Xia, Yubin and others},
  journal = {arXiv preprint arXiv:2505.17716},
  year    = {2025}
}



## Memory RL
@article{tang2025agent,
  title   = {{Agent KB}: Leveraging cross-domain experience for agentic problem solving},
  author  = {Tang, Xiangru and Qin, Tianrui and Peng, Tianhao and Zhou, Ziyang and Shao, Daniel and Du, Tingting and Wei, Xinming and Xia, Peng and Wu, Fang and Zhu, He and others},
  journal = {arXiv preprint arXiv:2507.06229},
  year    = {2025}
}

@article{dou2025improving,
  title   = {Improving {RL} exploration for {LLM} reasoning through retrospective replay},
  author  = {Dou, Shihan and Wu, Muling and Xu, Jingwen and Zheng, Rui and Gui, Tao and Zhang, Qi and Huang, Xuanjing},
  journal = {arXiv preprint arXiv:2504.14363},
  year    = {2025}
}

@article{shinn2023reflexion,
  author  = {Shinn, Noah and Cassano, Federico and Gopinath, Ashwin and Narasimhan, Karthik and Yao, Shunyu},
  title   = {Reflexion: Language agents with verbal reinforcement learning},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {36},
  pages   = {8634--8652},
  year    = {2023},
}

@article{yang2025agentic,
  author  = {Yang, Xidong and Li, Wenhao and Sheng, Junjie and Shen, Chuyun and Hua, Yun and Wang, Xiangfeng},
  title   = {Agentic Episodic Control},
  journal = {arXiv preprint arXiv:2506.01442},
  year    = {2025},
}

@article{liu2023think,
  title   = {{Think-in-Memory}: Recalling and post-thinking enable {LLMs} with long-term memory},
  author  = {Liu, Lei and Yang, Xiaoyan and Shen, Yue and Hu, Binbin and Zhang, Zhiqiang and Gu, Jinjie and Zhang, Guannan},
  journal = {arXiv preprint arXiv:2311.08719},
  year    = {2023}
}

@inproceedings{zhong2024memorybank,
  title        = {{MemoryBank}: Enhancing Large Language Models with Long-Term Memory},
  author       = {Zhong, Wanjun and Guo, Lianghong and Gao, Qiqi and Ye, He and Wang, Yanlin},
  booktitle    = {38th AAAI Conference on Artificial Intelligence, AAAI 2024, Feb 20-27 2024 Vancouver, Canada},
  volume       = {38},
  pages        = {19724--19731},
  year         = {2024},
  organization = {Association for the Advancement of Artificial Intelligence (AAAI)}
}

@article{hu2023chatdb,
  title   = {{ChatDB}: Augmenting {LLMs} with Databases as Their Symbolic Memory},
  author  = {Hu, Chenxu and Fu, Jie and Du, Chenzhuang and Luo, Simian and Zhao, Junbo and Zhao, Hang},
  journal = {arXiv preprint arXiv:2306.03901},
  year    = {2023}
}

@article{modarressi2023ret,
  title   = {{RET-LLM}: Towards a general read-write memory for large language models},
  author  = {Modarressi, Ali and Imani, Ayyoob and Fayyaz, Mohsen and Sch{\"u}tze, Hinrich},
  journal = {arXiv preprint arXiv:2305.14322},
  year    = {2023}
}

@article{silver2025welcome,
  author  = {Silver, David and Sutton, Richard S},
  title   = {Welcome to the era of experience},
  journal = {Google AI},
  volume  = {1},
  year    = {2025},
}

@article{yu2025memagent,
  title   = {{MemAgent}: Reshaping Long-Context {LLM} with Multi-Conv {RL}-based Memory Agent},
  author  = {Yu, Hongli and Chen, Tinghong and Feng, Jiangtao and Chen, Jiangjie and Dai, Weinan and Yu, Qiying and Zhang, Ya-Qin and Ma, Wei-Ying and Liu, Jingjing and Wang, Mingxuan and others},
  journal = {arXiv preprint arXiv:2507.02259},
  year    = {2025}
}

@article{zhang2025rlep,
  title   = {{RLEP}: Reinforcement Learning with Experience Replay for {LLM} Reasoning},
  author  = {Zhang, Hongzhi and Fu, Jia and Zhang, Jingyuan and Fu, Kai and Wang, Qi and Zhang, Fuzheng and Zhou, Guorui},
  journal = {arXiv preprint arXiv:2507.07451},
  year    = {2025}
}

@article{zhou2025mem1,
  title   = {{MEM1}: Learning to Synergize Memory and Reasoning for Efficient Long-Horizon Agents},
  author  = {Zhou, Zijian and Qu, Ao and Wu, Zhaoxuan and Kim, Sunghwan and Prakash, Alok and Rus, Daniela and Zhao, Jinhua and Low, Bryan Kian Hsiang and Liang, Paul Pu},
  journal = {arXiv preprint arXiv:2506.15841},
  year    = {2025}
}

@article{xu2025mem,
  title   = {{A-MEM}: Agentic Memory for {LLM} Agents},
  author  = {Xu, Wujiang and Mei, Kai and Gao, Hang and Tan, Juntao and Liang, Zujie and Zhang, Yongfeng},
  journal = {arXiv preprint arXiv:2502.12110},
  year    = {2025}
}

@article{chhikara2025mem0,
  title   = {{Mem0}: Building Production-Ready {AI} Agents with Scalable Long-Term Memory},
  author  = {Chhikara, Prateek and Khant, Dev and Aryan, Saket and Singh, Taranjeet and Yadav, Deshraj},
  journal = {arXiv preprint arXiv:2504.19413},
  year    = {2025}
}

@article{wang2023enhancing,
  author = {Wang, Bing and Liang, Xinnian and Yang, Jian and Huang, Hui and Wu, Shuangzhi and Wu, Peihao and Lu, Lu and Ma, Zejun and Li, Zhoujun},
  title = {Enhancing large language model with self-controlled memory framework},
  journal = {arXiv preprint arXiv:2304.13343},
  year = {2023},
}

@article{yang2024text,
  title   = {{Memory$^3$}: Language Modeling with Explicit Memory},
  author  = {Yang, Hongkang and Lin, Zehao and Wang, Wenjin and Wu, Hao and Li, Zhiyu and Tang, Bo and Wei, Wenqiang and Wang, Jinbo and Tang, Zeyun and Song, Shichao and others},
  journal = {arXiv preprint arXiv:2407.01178},
  year    = {2024}
}

@article{gao2024memory,
  author = {Gao, Hang and Zhang, Yongfeng},
  title = {Memory sharing for large language model based agents},
  journal = {arXiv preprint arXiv:2404.09982},
  year = {2024},
}

@article{alonso2024toward,
  author = {Alonso, Nick and Figliolia, Tom{\'a}s and Ndirango, Anthony and Millidge, Beren},
  title = {Toward conversational agents with context and time sensitive long-term memory},
  journal = {arXiv preprint arXiv:2406.00057},
  year = {2024},
}

@article{wang2024wise,
  title   = {{WISE}: Rethinking the Knowledge Memory for Lifelong Model Editing of Large Language Models},
  author  = {Wang, Peng and Li, Zexi and Zhang, Ningyu and Xu, Ziwen and Yao, Yunzhi and Jiang, Yong and Xie, Pengjun and Huang, Fei and Chen, Huajun},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {37},
  pages   = {53764--53797},
  year    = {2024}
}

@article{anokhin2024arigraph,
  title   = {{AriGraph}: Learning Knowledge Graph World Models with Episodic Memory for {LLM} Agents},
  author  = {Anokhin, Petr and Semenov, Nikita and Sorokin, Artyom and Evseev, Dmitry and Kravchenko, Andrey and Burtsev, Mikhail and Burnaev, Evgeny},
  journal = {arXiv preprint arXiv:2407.04363},
  year    = {2024}
}

# RL Dataset
## Existing Dataset
@article{li2025limr,
  title   = {{LIMR}: Less is More for {RL} Scaling},
  author  = {Li, Xuefeng and Zou, Haoyang and Liu, Pengfei},
  journal = {arXiv preprint arXiv:2502.11886},
  year    = {2025}
}

@article{ye2025limo,
  title   = {{LIMO}: Less is More for Reasoning},
  author  = {Ye, Yixin and Huang, Zhen and Xiao, Yang and Chern, Ethan and Xia, Shijie and Liu, Pengfei},
  journal = {arXiv preprint arXiv:2502.03387},
  year    = {2025}
}

@article{he2025deepmath,
  title   = {{DeepMath-103K}: A Large-Scale, Challenging, Decontaminated, and Verifiable Mathematical Dataset for Advancing Reasoning},
  author  = {He, Zhiwei and Liang, Tian and Xu, Jiahao and Liu, Qiuzhi and Chen, Xingyu and Wang, Yue and Song, Linfeng and Yu, Dian and Liang, Zhenwen and Wang, Wenxuan and others},
  journal = {arXiv preprint arXiv:2504.11456},
  year    = {2025}
}


@article{li2025miromind,
  title   = {{MiroMind-M1}: An Open-Source Advancement in Mathematical Reasoning via Context-Aware Multi-Stage Policy Optimization},
  author  = {Li, Xingxuan and Xiao, Yao and Ng, Dianwen and Ye, Hai and Deng, Yue and Lin, Xiang and Wang, Bin and Mo, Zhanfeng and Zhang, Chong and Zhang, Yueyi and others},
  journal = {arXiv preprint arXiv:2507.14683},
  year    = {2025}
}

@article{chen2025empirical,
  title   = {An Empirical Study on Eliciting and Improving {R1-like} Reasoning Models},
  author  = {Chen, Zhipeng and Min, Yingqian and Zhang, Beichen and Chen, Jie and Jiang, Jinhao and Cheng, Daixuan and Zhao, Wayne Xin and Liu, Zheng and Miao, Xu and Lu, Yang and others},
  journal = {arXiv preprint arXiv:2503.04548},
  year    = {2025}
}

@article{ai2025m2,
  title   = {{M2-Reasoning}: Empowering {MLLMs} with Unified General and Spatial Reasoning},
  author  = {{Inclusion AI} and Wang, Fudong and Liu, Jiajia and Chen, Jingdong and Zhou, Jun and Ji, Kaixiang and Ru, Lixiang and Guo, Qingpei and Zheng, Ruobing and Li, Tianqi and others},
  journal = {arXiv preprint arXiv:2507.08306},
  year    = {2025}
}

@article{lin2025embrace,
  title   = {{EmbRACE-3K}: Embodied Reasoning and Action in Complex Environments},
  author  = {Lin, Mingxian and Huang, Wei and Li, Yitang and Jiang, Chengjie and Wu, Kui and Zhong, Fangwei and Qian, Shengju and Wang, Xin and Qi, Xiaojuan},
  journal = {arXiv preprint arXiv:2507.10548},
  year    = {2025}
}

@article{xu2024llava,
  title   = {{LLaVA-CoT}: Let Vision Language Models Reason Step-by-Step},
  author  = {Xu, Guowei and Jin, Peng and Wu, Ziang and Li, Hao and Song, Yibing and Sun, Lichao and Yuan, Li},
  journal = {arXiv preprint arXiv:2411.10440},
  year    = {2024}
}

@article{guha2025openthoughts,
  title   = {{OpenThoughts}: Data Recipes for Reasoning Models},
  author  = {Guha, Etash and Marten, Ryan and Keh, Sedrick and Raoof, Negin and Smyrnis, Georgios and Bansal, Hritik and Nezhurina, Marianna and Mercat, Jean and Vu, Trung and Sprague, Zayne and others},
  journal = {arXiv preprint arXiv:2506.04178},
  year    = {2025}
}

@article{cai2025reasoning,
  title   = {Reasoning with {OmniThought}: A Large {CoT} Dataset with Verbosity and Cognitive Difficulty Annotations},
  author  = {Cai, Wenrui and Wang, Chengyu and Yan, Junbing and Huang, Jun and Fang, Xiangzhong},
  journal = {arXiv preprint arXiv:2505.10937},
  year    = {2025}
}

@article{yu2025z1,
  author = {Yu, Zhaojian and Wu, Yinghao and Zhao, Yilun and Cohan, Arman and Zhang, Xiao-Ping},
  title = {Z1: Efficient test-time scaling with code},
  journal = {arXiv preprint arXiv:2504.00810},
  year = {2025},
}

@article{ma2025sci,
  title   = {{SCI-Reason}: A Dataset with Chain-of-Thought Rationales for Complex Multimodal Reasoning in Academic Areas},
  author  = {Ma, Chenghao and Ding, Junpeng and Zhang, Jun and Ma, Ziyan and Qing, Huang and Gao, Bofei and Chen, Liang and Song, Meina and others},
  journal = {arXiv preprint arXiv:2504.06637},
  year    = {2025}
}

@article{hegde2025chartqa,
  title   = {{ChartQA-X}: Generating Explanations for Charts},
  author  = {Hegde, Shamanthak and Fazli, Pooyan and Seifi, Hasti},
  journal = {arXiv preprint arXiv:2504.13275},
  year    = {2025}
}

@article{wei2025chartmind,
  title   = {{ChartMind}: A Comprehensive Benchmark for Complex Real-world Multimodal Chart Question Answering},
  author  = {Wei, Jingxuan and Xu, Nan and Zhu, Junnan and Hao, Yanni and Wu, Gaowei and Yu, Bihui and Wang, Lei},
  journal = {arXiv preprint arXiv:2505.23242},
  year    = {2025}
}

@article{jia2025chartreasoner,
  title   = {{ChartReasoner}: Code-Driven Modality Bridging for Long-Chain Reasoning in Chart Question Answering},
  author  = {Jia, Caijun and Xu, Nan and Wei, Jingxuan and Wang, Qingli and Wang, Lei and Yu, Bihui and Zhu, Junnan},
  journal = {arXiv preprint arXiv:2506.10116},
  year    = {2025}
}

@article{chen2025chart,
  title   = {{Chart-R1}: Chain-of-Thought Supervision and Reinforcement for Advanced Chart Reasoner},
  author  = {Chen, Lei and Zhao, Xuanle and Zeng, Zhixiong and Huang, Jing and Zhong, Yufeng and Ma, Lin},
  journal = {arXiv preprint arXiv:2507.15509},
  year    = {2025}
}

@article{yuan2025naturalreasoning,
  title   = {{NaturalReasoning}: Reasoning in the Wild with {2.8M} Challenging Questions},
  author  = {Yuan, Weizhe and Yu, Jane and Jiang, Song and Padthe, Karthik and Li, Yang and Kulikov, Ilia and Cho, Kyunghyun and Wang, Dong and Tian, Yuandong and Weston, Jason E and others},
  journal = {arXiv preprint arXiv:2502.13124},
  year    = {2025}
}

@article{lu2025scp,
  title   = {{SCP-116K}: A High-Quality Problem-Solution Dataset and a Generalized Pipeline for Automated Extraction in the Higher Education Science Domain},
  author  = {Lu, Dakuan and Tan, Xiaoyu and Xu, Rui and Yao, Tianchu and Qu, Chao and Chu, Wei and Xu, Yinghui and Qi, Yuan},
  journal = {arXiv preprint arXiv:2501.15587},
  year    = {2025}
}

@article{li2025beyond,
  title   = {Beyond Chemical {QA}: Evaluating {LLM's} Chemical Reasoning with Modular Chemical Operations},
  author  = {Li, Hao and Cao, He and Feng, Bin and Shao, Yanjun and Tang, Xiangru and Yan, Zhiyuan and Yuan, Li and Tian, Yonghong and Li, Yu},
  journal = {arXiv preprint arXiv:2505.21318},
  year    = {2025}
}

@article{sun2025reasonmed,
  title   = {{ReasonMed}: A {370K} Multi-Agent Generated Dataset for Advancing Medical Reasoning},
  author  = {Sun, Yu and Qian, Xingyu and Xu, Weiwen and Zhang, Hao and Xiao, Chenghao and Li, Long and Rong, Yu and Huang, Wenbing and Bai, Qifeng and Xu, Tingyang},
  journal = {arXiv preprint arXiv:2506.09513},
  year    = {2025}
}

@article{zuo2025medxpertqa,
  title   = {{MedXpertQA}: Benchmarking Expert-Level Medical Reasoning and Understanding},
  author  = {Zuo, Yuxin and Qu, Shang and Li, Yifei and Chen, Zhangren and Zhu, Xuekai and Hua, Ermo and Zhang, Kaiyan and Ding, Ning and Zhou, Bowen},
  journal = {arXiv preprint arXiv:2501.18362},
  year    = {2025}
}

@article{sellergren2025medgemma,
  title   = {{MedGemma} Technical Report},
  author  = {Sellergren, Andrew and Kazemzadeh, Sahar and Jaroensri, Tiam and Kiraly, Atilla and Traverse, Madeleine and Kohlberger, Timo and Xu, Shawn and Jamil, Fayaz and Hughes, C{\'\i}an and Lau, Charles and others},
  journal = {arXiv preprint arXiv:2507.05201},
  year    = {2025}
}

@article{jing2025reason,
  author  = {Jing, Peiyuan and Lee, Kinhei and Zhang, Zhenxuan and Zhou, Huichi and Yuan, Zhengqing and Gao, Zhifan and Zhu, Lei and Papanastasiou, Giorgos and Fang, Yingying and Yang, Guang},
  title   = {Reason Like a Radiologist: Chain-of-Thought and Reinforcement Learning for Verifiable Report Generation},
  journal = {arXiv preprint arXiv:2504.18453},
  year    = {2025},
}

@article{zheng2025scaling,
  title   = {Scaling Physical Reasoning with the {PHYSICS} Dataset},
  author  = {Zheng, Shenghe and Cheng, Qianjia and Yao, Junchi and Wu, Mengsong and He, Haonan and Ding, Ning and Cheng, Yu and Hu, Shuyue and Bai, Lei and Zhou, Dongzhan and others},
  journal = {arXiv preprint arXiv:2506.00022},
  year    = {2025}
}

@article{baumgartner2025peerqa,
  title   = {{PeerQA}: A Scientific Question Answering Dataset from Peer Reviews},
  author  = {Baumg{\"a}rtner, Tim and Briscoe, Ted and Gurevych, Iryna},
  journal = {arXiv preprint arXiv:2502.13668},
  year    = {2025}
}

@article{fan2025megascience,
  title   = {{MegaScience}: Pushing the Frontiers of Post-Training Datasets for Science Reasoning},
  author  = {Fan, Run-Ze and Wang, Zengzhi and Liu, Pengfei},
  journal = {arXiv preprint arXiv:2507.16812},
  year    = {2025}
}

@article{corbiere2025drivingvqa,
  title   = {{DrivingVQA}: Analyzing Visual Chain-of-Thought Reasoning of Vision Language Models in Real-World Scenarios with Driving Theory Tests},
  author  = {Corbi{\`e}re, Charles and Roburin, Simon and Montariol, Syrielle and Bosselut, Antoine and Alahi, Alexandre},
  journal = {arXiv preprint arXiv:2501.04671},
  year    = {2025}
}

@article{chae2025one,
  title   = {One Missing Piece for Open-Source Reasoning Models: A Dataset to Mitigate Cold-Starting Short {CoT} {LLMs} in {RL}},
  author  = {Chae, Hyungjoo and Kang, Dongjin and Kim, Jihyuk and Kwak, Beong-woo and Park, Sunghyun and Park, Haeju and Yeo, Jinyoung and Lee, Moontae and Lee, Kyungjae},
  journal = {arXiv preprint arXiv:2506.02338},
  year    = {2025}
}

@article{xu2025redstar,
  title   = {{RedStar}: Does Scaling {Long-CoT} Data Unlock Better Slow-Reasoning Systems?},
  author  = {Xu, Haotian and Wu, Xing and Wang, Weinong and Li, Zhongzhi and Zheng, Da and Chen, Boyuan and Hu, Yi and Kang, Shijia and Ji, Jiaming and Zhang, Yingying and others},
  journal = {arXiv preprint arXiv:2501.11284},
  year    = {2025}
}

@misc{2025synthetic1,
  title  = {{SYNTHETIC-1}: Two Million Collaboratively Generated Reasoning Traces from {Deepseek-R1}},
  author = {Mattern, Justus and Jaghouar, Sami and Basra, Manveer and Straube, Jannik and Di Ferrante, Matthew and Gabriel, Felix and Ong, Jack Min and Weisser, Vincent and Hagemann, Johannes},
  year   = {2025},
  url    = {https://www.primeintellect.ai/blog/synthetic-1-release}
}

@article{he2025skywork,
  title   = {{Skywork} Open Reasoner 1 Technical Report},
  author  = {He, Jujie and Liu, Jiacai and Liu, Chris Yuhao and Yan, Rui and Wang, Chaojie and Cheng, Peng and Zhang, Xiaoyu and Zhang, Fuxiang and Xu, Jiacheng and Shen, Wei and others},
  journal = {arXiv preprint arXiv:2505.22312},
  year    = {2025}
}

@article{zhao20251,
  author = {Zhao, Han and Wang, Haotian and Peng, Yiping and Zhao, Sitong and Tian, Xiaoyu and Chen, Shuaiting and Ji, Yunjie and Li, Xiangang},
  title = {1.4 million open-source distilled reasoning dataset to empower large language model training},
  journal = {arXiv preprint arXiv:2503.19633},
  year = {2025},
}

@misc{bespoke_stratos,
  author       = {{Bespoke Labs}},
  title        = {{Bespoke-Stratos}: The Unreasonable Effectiveness of Reasoning Distillation},
  howpublished = {\url{https://www.bespokelabs.ai/blog/bespoke-stratos-the-unreasonable-effectiveness-of-reasoning-distillation}},
  note         = {Accessed: 2025-01-22},
  year         = {2025}
}

@article{muennighoff2025s1,
  author = {Muennighoff, Niklas and Yang, Zitong and Shi, Weijia and Li, Xiang Lisa and Fei-Fei, Li and Hajishirzi, Hannaneh and Zettlemoyer, Luke and Liang, Percy and Cand{\`e}s, Emmanuel and Hashimoto, Tatsunori},
  title = {s1: Simple test-time scaling},
  journal = {arXiv preprint arXiv:2501.19393},
  year = {2025},
}

@article{liu2025synlogic,
  title   = {{SynLogic}: Synthesizing Verifiable Reasoning Data at Scale for Learning Logical Reasoning and Beyond},
  author  = {Liu, Junteng and Fan, Yuanxiang and Jiang, Zhuo and Ding, Han and Hu, Yongyi and Zhang, Chi and Shi, Yiqi and Weng, Shitong and Chen, Aili and Chen, Shiqi and others},
  journal = {arXiv preprint arXiv:2505.19641},
  year    = {2025}
}

@article{estermann2024puzzles,
  title   = {{PUZZLES}: A Benchmark for Neural Algorithmic Reasoning},
  author  = {Estermann, Benjamin and Lanzend{\"o}rfer, Luca and Niedermayr, Yannick and Wattenhofer, Roger},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {37},
  pages   = {127059--127098},
  year    = {2024}
}

@article{cheng2025revisiting,
  title   = {Revisiting Reinforcement Learning for {LLM} Reasoning from A Cross-Domain Perspective},
  author  = {Cheng, Zhoujun and Hao, Shibo and Liu, Tianyang and Zhou, Fan and Xie, Yutao and Yao, Feng and Bian, Yuexin and Zhuang, Yonghao and Dey, Nilabjo and Zha, Yuheng and others},
  journal = {arXiv preprint arXiv:2506.14965},
  year    = {2025}
}

@article{brown2024large,
  author = {Brown, Bradley and Juravsky, Jordan and Ehrlich, Ryan and Clark, Ronald and Le, Quoc V and R{\'e}, Christopher and Mirhoseini, Azalia},
  title = {Large language monkeys: Scaling inference compute with repeated sampling},
  journal = {arXiv preprint arXiv:2407.21787},
  year = {2024},
}

@article{snell2024scaling,
  title   = {Scaling {LLM} Test-Time Compute Optimally Can Be More Effective than Scaling Model Parameters},
  author  = {Snell, Charlie and Lee, Jaehoon and Xu, Kelvin and Kumar, Aviral},
  journal = {arXiv preprint arXiv:2408.03314},
  year    = {2024}
}

@article{wu2025phd,
  title   = {{PhD} Knowledge Not Required: A Reasoning Challenge for Large Language Models},
  author  = {Wu, Zixuan and Lucchetti, Francesca and Boruch-Gruszecki, Aleksander and Zhao, Jingmiao and Anderson, Carolyn Jane and Biswas, Joydeep and Cassano, Federico and Feldman, Molly Q and Guha, Arjun},
  journal = {arXiv preprint arXiv:2502.01584},
  year    = {2025}
}

@misc{AM-DeepSeek-R1-0528-Distilled,
  title  = {{AM-DeepSeek-R1-0528-Distilled}},
  url    = {https://github.com/a-m-team/a-m-models},
  author = {{a-m-team}},
  month  = jun,
  year   = {2025}
}

@misc{QuixiAI_DolphinR1_2025,
  title        = {{Dolphin R1} Dataset},
  author       = {{Dolphin Team}},
  year         = {2025},
  howpublished = {\url{https://huggingface.co/datasets/QuixiAI/dolphin-r1}},
  note         = {Dataset, Apache-2.0 license},
  url          = {https://huggingface.co/datasets/QuixiAI/dolphin-r1}
}


@article{leng2025crosswordbench,
  title   = {{CrossWordBench}: Evaluating the Reasoning Capabilities of {LLMs} and {LVLMs} with Controllable Puzzle Generation},
  author  = {Leng, Jixuan and Huang, Chengsong and Huang, Langlin and Lin, Bill Yuchen and Cohen, William W and Wang, Haohan and Huang, Jiaxin},
  journal = {arXiv preprint arXiv:2504.00043},
  year    = {2025}
}

@article{liu2025logiccat,
  title   = {{LogicCat}: A Chain-of-Thought {Text-to-SQL} Benchmark for Multi-Domain Reasoning Challenges},
  author  = {Liu, Tao and Zan, Hongying and Li, Yifan and Zhang, Dixuan and Kong, Lulu and Liu, Haixin and Hou, Jiaming and Zheng, Aoze and Li, Rui and Qiao, Yiming and others},
  journal = {arXiv preprint arXiv:2505.18744},
  year    = {2025}
}

@article{xia2025leetcodedataset,
  title   = {{LeetCodeDataset}: A Temporal Dataset for Robust Evaluation and Efficient Training of Code {LLMs}},
  author  = {Xia, Yunhui and Shen, Wei and Wang, Yan and Liu, Jason Klein and Sun, Huifeng and Wu, Siyue and Hu, Jian and Xu, Xiaolong},
  journal = {arXiv preprint arXiv:2504.14655},
  year    = {2025}
}

@misc{code-r1,
  title        = {{Code-R1}: Reproducing {R1} for Code with Reliable Rewards},
  author       = {Liu, Jiawei and Zhang, Lingming},
  howpublished = {\url{https://github.com/ganler/code-r1}},
  year         = {2025}
}

@misc{miromind2024opendata,
  title  = {{MiroVerse V0.1}: A Reproducible, Full-Trajectory, Ever-Growing Deep Research Dataset},
  author = {{MiroMind Data Team}},
  year   = {2025},
  url    = {https://huggingface.co/datasets/miromind-ai/MiroVerse-v0.1}
}


@misc{luo2025deepcoder,
  title        = {{DeepCoder}: A Fully Open-Source {14B} Coder at {O3-mini} Level},
  author       = {Michael Luo and Sijun Tan and Roy Huang and Ameen Patel and Alpay Ariyak and Qingyang Wu and Xiaoxiang Shi and Rachel Xin and Colin Cai and Maurice Weber and Ce Zhang and Li Erran Li and Raluca Ada Popa and Ion Stoica},
  howpublished = {\url{https://pretty-radio-b75.notion.site/DeepCoder-A-Fully-Open-Source-14B-Coder-at-O3-mini-Level-1cf81902c14680b3bee5eb349a512a51}},
  note         = {Notion Blog},
  year         = {2025}
}

@article{ahmad2025opencodeinstruct,
  title   = {{OpenCodeInstruct}: A Large-scale Instruction Tuning Dataset for Code {LLMs}},
  author  = {Ahmad, Wasi Uddin and Ficek, Aleksander and Samadi, Mehrzad and Huang, Jocelyn and Noroozi, Vahid and Majumdar, Somshubra and Ginsburg, Boris},
  journal = {arXiv preprint arXiv:2504.04030},
  year    = {2025}
}

@article{ahmad2025opencodereasoning,
  title   = {{OpenCodeReasoning}: Advancing Data Distillation for Competitive Coding},
  author  = {Ahmad, Wasi Uddin and Narenthiran, Sean and Majumdar, Somshubra and Ficek, Aleksander and Jain, Siddhartha and Huang, Jocelyn and Noroozi, Vahid and Ginsburg, Boris},
  journal = {arXiv preprint arXiv:2504.01943},
  year    = {2025}
}

@article{liu2025rstar,
  title   = {{rStar-Coder}: Scaling Competitive Code Reasoning with a Large-Scale Verified Dataset},
  author  = {Liu, Yifei and Zhang, Li Lyna and Zhu, Yi and Dong, Bingcheng and Zhou, Xudong and Shang, Ning and Yang, Fan and Yang, Mao},
  journal = {arXiv preprint arXiv:2505.21297},
  year    = {2025}
}

@article{xu2025kodcode,
  title   = {{KodCode}: A Diverse, Challenging, and Verifiable Synthetic Dataset for Coding},
  author  = {Xu, Zhangchen and Liu, Yang and Yin, Yueqin and Zhou, Mingyuan and Poovendran, Radha},
  journal = {arXiv preprint arXiv:2503.02951},
  year    = {2025}
}

@misc{penedo2025codeforces,
  title        = {{CodeForces} {CoTs}},
  author       = {Penedo, Guilherme and Lozhkov, Anton and Kydl{\'\i}{\v{c}}ek, Hynek and Ben Allal, Loubna and Beeching, Edward and Piqueres Lajar{\'\i}n, Agust{\'\i}n and Gallou{\'e}dec, Quentin and Habib, Nathan and Tunstall, Lewis and von Werra, Leandro},
  year         = {2025},
  publisher    = {Hugging Face},
  journal      = {Hugging Face repository},
  howpublished = {\url{https://huggingface.co/datasets/open-r1/codeforces-cots}}
}


@article{wang2025co,
  title   = {Co-Evolving {LLM} Coder and Unit Tester via Reinforcement Learning},
  author  = {Wang, Yinjie and Yang, Ling and Tian, Ye and Shen, Ke and Wang, Mengdi},
  journal = {arXiv preprint arXiv:2506.03136},
  year    = {2025}
}

@article{zhang2025swe,
  title   = {{SWE-Flow}: Synthesizing Software Engineering Data in a Test-Driven Manner},
  author  = {Zhang, Lei and Yang, Jiaxi and Yang, Min and Yang, Jian and Chen, Mouxiang and Zhang, Jiajun and Cui, Zeyu and Hui, Binyuan and Lin, Junyang},
  journal = {arXiv preprint arXiv:2506.09003},
  year    = {2025}
}

@article{fu2025klear,
  title   = {{Klear-CodeTest}: Scalable Test Case Generation for Code Reinforcement Learning},
  author  = {Fu, Jia and Yang, Xinyu and Zhang, Hongzhi and Liu, Yahui and Zhang, Jingyuan and Wang, Qi and Zhang, Fuzheng and Zhou, Guorui},
  journal = {arXiv preprint arXiv:2508.05710},
  year    = {2025}
}

@article{toshniwal2024openmathinstruct,
  title   = {{OpenMathInstruct-2}: Accelerating {AI} for Math with Massive Open-Source Instruction Data},
  author  = {Toshniwal, Shubham and Du, Wei and Moshkov, Ivan and Kisacanin, Branislav and Ayrapetyan, Alexan and Gitman, Igor},
  journal = {arXiv preprint arXiv:2410.01560},
  year    = {2024}
}

@article{wang2025synthesizing,
  author  = {Wang, Zhilin and Yang, Zhe and Luo, Yun and Li, Yafu and Zhang, Haoran and Zhan, Runzhe and Wong, Derek F and Zhou, Jizhe and Cheng, Yu},
  title   = {Synthesizing Sheet Music Problems for Evaluation and Reinforcement Learning},
  journal = {arXiv preprint arXiv:2509.04059},
  year    = {2025},
}

@article{albalak2025big,
  title   = {{Big-Math}: A Large-Scale, High-Quality Math Dataset for Reinforcement Learning in Language Models},
  author  = {Albalak, Alon and Phung, Duy and Lile, Nathan and Rafailov, Rafael and Gandhi, Kanishk and Castricato, Louis and Singh, Anikait and Blagden, Chase and Xiang, Violet and Mahan, Dakota and others},
  journal = {arXiv preprint arXiv:2502.17387},
  year    = {2025}
}

@article{moshkov2025aimo,
  title   = {{AIMO-2} Winning Solution: Building State-of-the-Art Mathematical Reasoning Models with {OpenMathReasoning} Dataset},
  author  = {Moshkov, Ivan and Hanley, Darragh and Sorokin, Ivan and Toshniwal, Shubham and Henkel, Christof and Schifferer, Benedikt and Du, Wei and Gitman, Igor},
  journal = {arXiv preprint arXiv:2504.16891},
  year    = {2025}
}

@article{wen2025light,
  title   = {{Light-R1}: Curriculum {SFT}, {DPO} and {RL} for Long {COT} from Scratch and Beyond},
  author  = {Wen, Liang and Cai, Yunke and Xiao, Fenrui and He, Xin and An, Qi and Duan, Zhenyu and Du, Yimin and Liu, Junchen and Tang, Lifu and Lv, Xiaowei and others},
  journal = {arXiv preprint arXiv:2503.10460},
  year    = {2025}
}

@article{luo2025deepscaler,
  title   = {{DeepScaleR}: Surpassing {O1-Preview} with a {1.5B} Model by Scaling {RL}},
  author  = {Luo, Michael and Tan, Sijun and Wong, Justin and Shi, Xiaoxiang and Tang, William Y and Roongta, Manan and Cai, Colin and Luo, Jeffrey and Zhang, Tianjun and Li, Li Erran and others},
  journal = {Notion Blog},
  year    = {2025}
}

@article{pan2024training,
  title   = {Training Software Engineering Agents and Verifiers with {SWE-Gym}},
  author  = {Pan, Jiayi and Wang, Xingyao and Neubig, Graham and Jaitly, Navdeep and Ji, Heng and Suhr, Alane and Zhang, Yizhe},
  journal = {arXiv preprint arXiv:2412.21139},
  year    = {2024}
}

@article{xie2025swe,
  title   = {{SWE-Fixer}: Training Open-Source {LLMs} for Effective and Efficient {GitHub} Issue Resolution},
  author  = {Xie, Chengxing and Li, Bowen and Gao, Chang and Du, He and Lam, Wai and Zou, Difan and Chen, Kai},
  journal = {arXiv preprint arXiv:2501.05040},
  year    = {2025}
}

## Synthetic Dataset JC

@article{dohare2024loss,
  title     = {Loss of plasticity in deep continual learning},
  author    = {Dohare, Shibhansh and Hernandez-Garcia, J Fernando and Lan, Qingfeng and Rahman, Parash and Mahmood, A Rupam and Sutton, Richard S},
  journal   = {Nature},
  volume    = {632},
  number    = {8026},
  pages     = {768--774},
  year      = {2024},
  publisher = {Nature Publishing Group UK},
  address   = {London}
}

@article{zheng2022lifelong,
  author = {Zheng, Xuejing and Yu, Chao and Zhang, Minjie},
  title = {Lifelong reinforcement learning with temporal logic formulas and reward machines},
  journal = {Knowledge-Based Systems},
  publisher = {Elsevier},
  volume = {257},
  pages = {109650},
  year = {2022},
}

@inproceedings{jiang2021temporal,
  title     = {Temporal-logic-based reward shaping for continuing reinforcement learning tasks},
  author    = {Jiang, Yuqian and Bharadwaj, Suda and Wu, Bo and Shah, Rishi and Topcu, Ufuk and Stone, Peter},
  booktitle = {Proceedings of the {AAAI} Conference on Artificial Intelligence},
  volume    = {35},
  pages     = {7995--8003},
  year      = {2021}
}

@article{garcia2019meta,
  title   = {A {Meta-MDP} Approach to Exploration for Lifelong Reinforcement Learning},
  author  = {Garcia, Francisco and Thomas, Philip S},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {32},
  year    = {2019}
}
@article{gaya2022building,
  author = {Gaya, Jean-Baptiste and Doan, Thang and Caccia, Lucas and Soulier, Laure and Denoyer, Ludovic and Raileanu, Roberta},
  title = {Building a subspace of policies for scalable continual learning},
  journal = {arXiv preprint arXiv:2211.10445},
  year = {2022},
}

@article{berseth2021comps,
  title   = {{CoMPS}: Continual Meta Policy Search},
  author  = {Berseth, Glen and Zhang, Zhiwei and Zhang, Grace and Finn, Chelsea and Levine, Sergey},
  journal = {arXiv preprint arXiv:2112.04467},
  year    = {2021}
}

@article{rolnick2019experience,
  title   = {Experience replay for continual learning},
  author  = {Rolnick, David and Ahuja, Arun and Schwarz, Jonathan and Lillicrap, Timothy and Wayne, Gregory},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {32},
  year    = {2019}
}

@article{li2021sler,
  title     = {{SLER}: Self-Generated Long-Term Experience Replay for Continual Reinforcement Learning},
  author    = {Li, Chunmao and Li, Yang and Zhao, Yinliang and Peng, Peng and Geng, Xupeng},
  journal   = {Applied Intelligence},
  volume    = {51},
  number    = {1},
  pages     = {185--201},
  year      = {2021},
  publisher = {Springer}
}

@article{wolczyk2021continual,
  title   = {{Continual World}: A Robotic Benchmark for Continual Reinforcement Learning},
  author  = {Wo{\l}czyk, Maciej and Zaj{\k{a}}c, Micha{\l} and Pascanu, Razvan and Kuci{\'n}ski, {\L}ukasz and Mi{\l}o{\'s}, Piotr},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {34},
  pages   = {28496--28510},
  year    = {2021}
}

@inproceedings{todorov2012mujoco,
  title        = {{MuJoCo}: A Physics Engine for Model-Based Control},
  author       = {Todorov, Emanuel and Erez, Tom and Tassa, Yuval},
  booktitle    = {2012 {IEEE/RSJ} International Conference on Intelligent Robots and Systems},
  pages        = {5026--5033},
  year         = {2012},
  organization = {IEEE}
}

@article{towers2024gymnasium,
  author = {Towers, Mark and Kwiatkowski, Ariel and Terry, Jordan and Balis, John U and De Cola, Gianluca and Deleu, Tristan and Goul{\~a}o, Manuel and Kallinteris, Andreas and Krimmel, Markus and KG, Arjun and others},
  title = {Gymnasium: A standard interface for reinforcement learning environments},
  journal = {arXiv preprint arXiv:2407.17032},
  year = {2024},
}

@article{chevalier2023minigrid,
  title   = {{Minigrid} \& {Miniworld}: Modular \& Customizable Reinforcement Learning Environments for Goal-Oriented Tasks},
  author  = {Chevalier-Boisvert, Maxime and Dai, Bolun and Towers, Mark and Perez-Vicente, Rodrigo and Willems, Lucas and Lahlou, Salem and Pal, Suman and Castro, Pablo Samuel and Terry, Jordan},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {36},
  pages   = {73383--73394},
  year    = {2023}
}

@inproceedings{dedieu2025improving,
  title     = {Improving Transformer World Models for Data-Efficient {RL}},
  author    = {Dedieu, Antoine and Ortiz, Joseph and Lou, Xinghua and Wendelken, Carter and Lehrach, Wolfgang and Guntupalli, J Swaroop and Lazaro-Gredilla, Miguel and Murphy, Kevin Patrick},
  booktitle = {{ICLR} 2025 Workshop on World Models: Understanding, Modelling and Scaling},
  year      = {2025}
}

@article{hafner2023mastering,
  author = {Hafner, Danijar and Pasukonis, Jurgis and Ba, Jimmy and Lillicrap, Timothy},
  title = {Mastering diverse domains through world models},
  journal = {arXiv preprint arXiv:2301.04104},
  year = {2023},
}

@article{russell2025gaia,
  title   = {{GAIA-2}: A Controllable Multi-View Generative World Model for Autonomous Driving},
  author  = {Russell, Lloyd and Hu, Anthony and Bertoni, Lorenzo and Fedoseev, George and Shotton, Jamie and Arani, Elahe and Corrado, Gianluca},
  journal = {arXiv preprint arXiv:2503.20523},
  year    = {2025}
}

@misc{earle2025puzzlejaxbenchmarkreasoninglearning,
  title         = {{PuzzleJAX}: A Benchmark for Reasoning and Learning},
  author        = {Sam Earle and Graham Todd and Yuchen Li and Ahmed Khalifa and Muhammad Umair Nasir and Zehua Jiang and Andrzej Banburski-Fahey and Julian Togelius},
  year          = {2025},
  eprint        = {2508.16821},
  archiveprefix = {arXiv},
  primaryclass  = {cs.AI},
  url           = {https://arxiv.org/abs/2508.16821}
}

@misc{synthetic2,
  title        = {{SYNTHETIC-2} Release: Four Million Collaboratively Generated Reasoning Traces},
  author       = {{PrimeIntellect}},
  howpublished = {\url{https://www.primeintellect.ai/blog/synthetic-2-release#synthetic-2-dataset}},
  year         = {2025},
  note         = {Technical Report}
}

@article{li2025internbootcamp,
  title   = {{InternBootcamp} Technical Report: Boosting {LLM} Reasoning with Verifiable Task Scaling},
  author  = {Li, Peiji and Ye, Jiasheng and Chen, Yongkang and Ma, Yichuan and Yu, Zijie and Chen, Kedi and Cui, Ganqu and Li, Haozhan and Chen, Jiacheng and Lyu, Chengqi and others},
  journal = {arXiv preprint arXiv:2508.08636},
  year    = {2025}
}

@article{liu2025spiral,
  title   = {{SPIRAL}: Self-Play on Zero-Sum Games Incentivizes Reasoning via Multi-Agent Multi-Turn Reinforcement Learning},
  author  = {Liu, Bo and Guertler, Leon and Yu, Simon and Liu, Zichen and Qi, Penghui and Balcells, Daniel and Liu, Mickel and Tan, Cheston and Shi, Weiyan and Lin, Min and others},
  journal = {arXiv preprint arXiv:2506.24119},
  year    = {2025}
}

@article{xu2025medagentgym,
  title   = {{MedAgentGym}: Training {LLM} Agents for Code-Based Medical Reasoning at Scale},
  author  = {Xu, Ran and Zhuang, Yuchen and Zhong, Yishan and Yu, Yue and Tang, Xiangru and Wu, Hang and Wang, May D and Ruan, Peifeng and Yang, Donghan and Wang, Tao and others},
  journal = {arXiv preprint arXiv:2506.04405},
  year    = {2025}
}

@article{yang2025zerogui,
  title   = {{ZeroGUI}: Automating Online {GUI} Learning at Zero Human Cost},
  author  = {Yang, Chenyu and Su, Shiqian and Liu, Shi and Dong, Xuan and Yu, Yue and Su, Weijie and Wang, Xuehui and Liu, Zhaoyang and Zhu, Jinguo and Li, Hao and others},
  journal = {arXiv preprint arXiv:2505.23762},
  year    = {2025}
}

@article{chen2025learning,
  title   = {Learning to Reason with Search for {LLMs} via Reinforcement Learning},
  author  = {Chen, Mingyang and Li, Tianpeng and Sun, Haoze and Zhou, Yijie and Zhu, Chenzheng and Wang, Haofen and Pan, Jeff Z and Zhang, Wen and Chen, Huajun and Yang, Fan and others},
  journal = {arXiv preprint arXiv:2503.19470},
  year    = {2025}
}

@article{nathani2025mlgym,
  title   = {{MLGym}: A New Framework and Benchmark for Advancing {AI} Research Agents},
  author  = {Nathani, Deepak and Madaan, Lovish and Roberts, Nicholas and Bashlykov, Nikolay and Menon, Ajay and Moens, Vincent and Budhiraja, Amar and Magka, Despoina and Vorotilov, Vladislav and Chaurasia, Gaurav and others},
  journal = {arXiv preprint arXiv:2502.14499},
  year    = {2025}
}

@inproceedings{trivedi2024appworld,
  title     = {{AppWorld}: A Controllable World of Apps and People for Benchmarking Interactive Coding Agents},
  author    = {Trivedi, Harsh and Khot, Tushar and Hartmann, Mareike and Manku, Ruskin and Dong, Vinty and Li, Edward and Gupta, Shashank and Sabharwal, Ashish and Balasubramanian, Niranjan},
  booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  pages     = {16022--16076},
  year      = {2024}
}

@inproceedings{jha2025cross,
  author    = {Jha, Kunal and Carvalho, Wilka and Liang, Yancheng and Du, Simon Shaolei and Kleiman-Weiner, Max and Jaques, Natasha},
  title     = {Cross-environment Cooperation Enables Zero-shot Multi-agent Coordination},
  year      = {2025},
  booktitle = {Forty-second International Conference on Machine Learning}
}

@article{chen2025g1,
  author  = {Chen, Liang and Gao, Hongcheng and Liu, Tianyu and Huang, Zhiqi and Sung, Flood and Zhou, Xinyu and Wu, Yuxin and Chang, Baobao},
  title   = {G1: Bootstrapping Perception and Reasoning Abilities of Vision-Language Model via Reinforcement Learning},
  year    = {2025},
  journal = {arXiv preprint arXiv:2505.13426}
}

@inproceedings{wang2022scienceworld,
  title     = {{ScienceWorld}: Is your Agent Smarter than a 5th Grader?},
  author    = {Wang, Ruoyao and Jansen, Peter and C{\^o}t{\'e}, Marc-Alexandre and Ammanabrolu, Prithviraj},
  booktitle = {Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing},
  pages     = {11279--11298},
  year      = {2022}
}

@inproceedings{shridhar2020alfworld,
  title     = {{ALFWorld}: Aligning Text and Embodied Environments for Interactive Learning},
  author    = {Shridhar, Mohit and Yuan, Xingdi and C{\^o}t{\'e}, Marc-Alexandre and Bisk, Yonatan and Trischler, Adam and Hausknecht, Matthew},
  booktitle = {International Conference on Learning Representations},
  year      = {2021}
}

@inproceedings{chen2025map,
  title     = {{OS-MAP}: How Far Can Computer Use Agents Go in Breadth and Depth?},
  author    = {Chen, Xuetian and Chen, Yinghao and Yuan, Xinfeng and Chen, Lu and Li, Yuekeng and Zhang, Zhoujia and Huang, Yingqian and Huang, Leyan and Liang, Jiaqing and Xie, Tianbao and others},
  booktitle = {ICML 2025 Workshop on Computer Use Agents},
  year      = {2025}
}

@article{zhang2025agentcpm,
  title   = {{AgentCPM-GUI}: Building Mobile-Use Agents with Reinforcement Fine-Tuning},
  author  = {Zhang, Zhong and Lu, Yaxi and Fu, Yikun and Huo, Yupeng and Yang, Shenzhi and Wu, Yesai and Si, Han and Cong, Xin and Chen, Haotian and Lin, Yankai and others},
  journal = {arXiv preprint arXiv:2506.01391},
  year    = {2025}
}

@article{ruhdorfer2024overcooked,
  title   = {The {Overcooked} Generalisation Challenge},
  author  = {Ruhdorfer, Constantin and Bortoletto, Matteo and Penzkofer, Anna and Bulling, Andreas},
  journal = {arXiv preprint arXiv:2406.17949},
  year    = {2024}
}

@article{li2025optimus,
  title   = {{Optimus-3}: Towards Generalist Multimodal {Minecraft} Agents with Scalable Task Experts},
  author  = {Li, Zaijing and Xie, Yuquan and Shao, Rui and Chen, Gongwei and Guan, Weili and Jiang, Dongmei and Nie, Liqiang},
  journal = {arXiv preprint arXiv:2506.10357},
  year    = {2025}
}

@misc{genie3,
  title        = {{Genie 3}: A New Frontier for World Models},
  author       = {Philip J. Ball and Jakob Bauer and Frank Belletti and Bethanie Brownfield and Ariel Ephrat and others},
  howpublished = {\url{https://deepmind.google/discover/blog/genie-3-a-new-frontier-for-world-models/}},
  year         = {2025}
}

@article{hu2025lmgame,
  title   = {{lmgame-Bench}: How Good are {LLMs} at Playing Games?},
  author  = {Hu, Lanxiang and Huo, Mingjia and Zhang, Yuxuan and Yu, Haoyang and Xing, Eric P and Stoica, Ion and Rosing, Tajana and Jin, Haojian and Zhang, Hao},
  journal = {arXiv preprint arXiv:2505.15146},
  year    = {2025}
}

@misc{wei2023asymmetry,
  author       = {Wei, Jason},
  title        = {The Asymmetry of Verification and Verifier's Law},
  howpublished = {\url{https://www.jasonwei.net/blog/asymmetry-of-verification-and-verifiers-law}},
  note         = {Accessed: 2025-07-15},
  year         = {2025}
}


@article{wang2025agentfly,
  title   = {{AgentFly}: Extensible and Scalable Reinforcement Learning for {LM} Agents},
  author  = {Wang, Renxi and Genadi, Rifo Ahmad and Bouardi, Bilal El and Wang, Yongxin and Koto, Fajri and Liu, Zhengzhong and Baldwin, Timothy and Li, Haonan},
  journal = {arXiv preprint arXiv:2507.14897},
  year    = {2025}
}

@article{shi2025korgym,
  title   = {{KORGym}: A Dynamic Game Platform for {LLM} Reasoning Evaluation},
  author  = {Shi, Jiajun and Yang, Jian and Liu, Jiaheng and Bu, Xingyuan and Chen, Jiangjie and Zhou, Junting and Ma, Kaijing and Wen, Zhoufutu and Wang, Bingli and He, Yancheng and others},
  journal = {arXiv preprint arXiv:2505.14552},
  year    = {2025}
}

@article{wu2025synthrl,
  title   = {{SynthRL}: Scaling Visual Reasoning with Verifiable Data Synthesis},
  author  = {Wu, Zijian and Ni, Jinjie and Liu, Xiangyan and Liu, Zichen and Yan, Hang and Shieh, Michael Qizhe},
  journal = {arXiv preprint arXiv:2506.02096},
  year    = {2025}
}

@article{xie2025play,
  author  = {Xie, Yunfei and Ma, Yinsong and Lan, Shiyi and Yuille, Alan and Xiao, Junfei and Wei, Chen},
  title   = {Play to Generalize: Learning to Reason Through Game Play},
  year    = {2025},
  journal = {arXiv preprint arXiv:2506.08011}
}

@article{jain2025r2e,
  title   = {{R2E-Gym}: Procedural Environments and Hybrid Verifiers for Scaling Open-Weights {SWE} Agents},
  author  = {Jain, Naman and Singh, Jaskirat and Shetty, Manish and Zheng, Liang and Sen, Koushik and Stoica, Ion},
  journal = {arXiv preprint arXiv:2504.07164},
  year    = {2025}
}

@article{liang2025sws,
  title   = {{SwS}: Self-aware Weakness-driven Problem Synthesis in Reinforcement Learning for {LLM} Reasoning},
  author  = {Liang, Xiao and Li, Zhong-Zhi and Gong, Yeyun and Wang, Yang and Zhang, Hengyuan and Shen, Yelong and Wu, Ying Nian and Chen, Weizhu},
  journal = {arXiv preprint arXiv:2506.08989},
  year    = {2025}
}

@article{he2025protoreasoning,
  title   = {{ProtoReasoning}: Prototypes as the Foundation for Generalizable Reasoning in {LLMs}},
  author  = {He, Feng and Chen, Zijun and Liang, Xinnian and Ma, Tingting and Qiu, Yunqi and Wu, Shuangzhi and Yan, Junchi},
  journal = {arXiv preprint arXiv:2506.15211},
  year    = {2025}
}

@article{badertdinov2025swe,
  title   = {{SWE-rebench}: An Automated Pipeline for Task Collection and Decontaminated Evaluation of Software Engineering Agents},
  author  = {Badertdinov, Ibragim and Golubev, Alexander and Nekrashevich, Maksim and Shevtsov, Anton and Karasik, Simon and Andriushchenko, Andrei and Trofimova, Maria and Litvintseva, Daria and Yangel, Boris},
  journal = {arXiv preprint arXiv:2505.20411},
  year    = {2025}
}

@article{goldie2025synthetic,
  title   = {Synthetic Data Generation \& Multi-Step {RL} for Reasoning \& Tool Use},
  author  = {Goldie, Anna and Mirhoseini, Azalia and Zhou, Hao and Cai, Irene and Manning, Christopher D},
  journal = {arXiv preprint arXiv:2504.04736},
  year    = {2025}
}

@article{akter2025nemotron,
  title   = {{Nemotron-CrossThink}: Scaling Self-Learning beyond Math Reasoning},
  author  = {Akter, Syeda Nahida and Prabhumoye, Shrimai and Novikov, Matvei and Han, Seungju and Lin, Ying and Bakhturina, Evelina and Nyberg, Eric and Choi, Yejin and Patwary, Mostofa and Shoeybi, Mohammad and others},
  journal = {arXiv preprint arXiv:2504.13941},
  year    = {2025}
}

@article{chen2025enigmata,
  author  = {Chen, Jiangjie and He, Qianyu and Yuan, Siyu and Chen, Aili and Cai, Zhicheng and Dai, Weinan and Yu, Hongli and Yu, Qiying and Li, Xuefeng and Chen, Jiaze and others},
  title   = {Enigmata: Scaling Logical Reasoning in Large Language Models with Synthetic Verifiable Puzzles},
  year    = {2025},
  journal = {arXiv preprint arXiv:2505.19914}
}
@inproceedings{bruce2024genie,
  author    = {Bruce, Jake and Dennis, Michael D and Edwards, Ashley and Parker-Holder, Jack and Shi, Yuge and Hughes, Edward and Lai, Matthew and Mavalankar, Aditi and Steigerwald, Richie and Apps, Chris and others},
  title     = {Genie: Generative interactive environments},
  year      = {2024},
  booktitle = {Forty-first International Conference on Machine Learning}
}

@article{xie2025logic,
  title   = {{Logic-RL}: Unleashing {LLM} Reasoning with Rule-Based Reinforcement Learning},
  author  = {Xie, Tian and Gao, Zitian and Ren, Qingnan and Luo, Haoming and Hong, Yuqian and Dai, Bryan and Zhou, Joey and Qiu, Kai and Wu, Zhirong and Luo, Chong},
  journal = {arXiv preprint arXiv:2502.14768},
  year    = {2025}
}

@misc{guertler2025textarena,
  title         = {{TextArena}},
  author        = {Leon Guertler and Bobby Cheng and Simon Yu and Bo Liu and Leshem Choshen and Cheston Tan},
  year          = {2025},
  eprint        = {2504.11442},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  url           = {https://arxiv.org/abs/2504.11442}
}


@article{stojanovski2025reasoning,
  title   = {{REASONING GYM}: Reasoning Environments for Reinforcement Learning with Verifiable Rewards},
  author  = {Stojanovski, Zafir and Stanley, Oliver and Sharratt, Joe and Jones, Richard and Adefioye, Abdulhakeem and Kaddour, Jean and K{\"o}pf, Andreas},
  journal = {arXiv preprint arXiv:2505.24760},
  year    = {2025}
}

@article{qiang2025mle,
  title   = {{MLE-Dojo}: Interactive Environments for Empowering {LLM} Agents in Machine Learning Engineering},
  author  = {Qiang, Rushi and Zhuang, Yuchen and Li, Yinghao and Zhang, Rongzhi and Li, Changhao and Wong, Ian Shu-Hei and Yang, Sherry and Liang, Percy and Zhang, Chao and Dai, Bo and others},
  journal = {arXiv preprint arXiv:2505.07782},
  year    = {2025}
}



@online{AlphaEvolveBlog,
  author    = {{The DeepMind Team}},
  title     = {{AlphaEvolve}: A {Gemini}-powered coding agent for designing advanced algorithms},
  year      = {2025},
  url       = {https://deepmind.google/discover/blog/alphaevolve-a-gemini-powered-coding-agent-for-designing-advanced-algorithms/},
  urldate   = {2025-05-14},
  publisher = {DeepMind},
  note      = {Blog post}
}

@article{zhou2025sweet,
  title   = {{SWEET-RL}: Training Multi-Turn {LLM} Agents on Collaborative Reasoning Tasks},
  author  = {Zhou, Yifei and Jiang, Song and Tian, Yuandong and Weston, Jason and Levine, Sergey and Sukhbaatar, Sainbayar and Li, Xian},
  journal = {arXiv preprint arXiv:2503.15478},
  year    = {2025}
}

@article{su2025crossing,
  title   = {Crossing the Reward Bridge: Expanding {RL} with Verifiable Rewards Across Diverse Domains},
  author  = {Su, Yi and Yu, Dian and Song, Linfeng and Li, Juntao and Mi, Haitao and Tu, Zhaopeng and Zhang, Min and Yu, Dong},
  journal = {arXiv preprint arXiv:2503.23829},
  year    = {2025}
}

@article{Risi2020Increasing,
  title   = {Increasing generality in machine learning through procedural content generation},
  author  = {Risi, Sebastian and Togelius, Julian},
  journal = {Nature Machine Intelligence},
  volume  = {2},
  pages   = {428--436},
  year    = {2020},
  doi     = {10.1038/s42256-020-0208-z},
  url     = {https://doi.org/10.1038/s42256-020-0208-z}
}

@article{zhu2025autologi,
  title   = {{AutoLogi}: Automated Generation of Logic Puzzles for Evaluating Reasoning Abilities of Large Language Models},
  author  = {Zhu, Qin and Huang, Fei and Peng, Runyu and Lu, Keming and Yu, Bowen and Cheng, Qinyuan and Qiu, Xipeng and Huang, Xuanjing and Lin, Junyang},
  journal = {arXiv preprint arXiv:2502.16906},
  year    = {2025}
}

@article{li2025wildlong,
  title   = {{WildLong}: Synthesizing Realistic Long-Context Instruction Data at Scale},
  author  = {Li, Jiaxi and Zhang, Xingxing and Wang, Xun and Huang, Xiaolong and Dong, Li and Wang, Liang and Chen, Si-Qing and Lu, Wei and Wei, Furu},
  journal = {arXiv preprint arXiv:2502.16684},
  year    = {2025}
}

@inproceedings{yang2025coast,
  title     = {{COAST}: Enhancing the Code Debugging Ability of {LLMs} through Communicative Agent Based Data Synthesis},
  author    = {Yang, Weiqing and Wang, Hanbin and Liu, Zhenghao and Li, Xinze and Yan, Yukun and Wang, Shuo and Gu, Yu and Yu, Minghe and Liu, Zhiyuan and Yu, Ge},
  booktitle = {Findings of the Association for Computational Linguistics: NAACL 2025},
  pages     = {2570--2585},
  year      = {2025}
}

@article{xu2025mindgym,
  title   = {{MindGYM}: What Matters in Question Synthesis for Thinking-Centric Fine-Tuning?},
  author  = {Xu, Zhe and Chen, Daoyuan and Ling, Zhenqing and Li, Yaliang and Shen, Ying},
  journal = {arXiv preprint arXiv:2503.09499},
  year    = {2025}
}

@inproceedings{hu2025agentgen,
  title     = {{AgentGen}: Enhancing Planning Abilities for Large Language Model based Agent via Environment and Task Generation},
  author    = {Hu, Mengkang and Zhao, Pu and Xu, Can and Sun, Qingfeng and Lou, Jian-Guang and Lin, Qingwei and Luo, Ping and Rajmohan, Saravan},
  booktitle = {Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining V. 1},
  pages     = {496--507},
  year      = {2025}
}

@inproceedings{hongjinlearn,
  title     = {Learn-by-interact: A Data-Centric Framework For Self-Adaptive Agents in Realistic Environments},
  author    = {Su, Hongjin and Sun, Ruoxi and Yoon, Jinsung and Yin, Pengcheng and Yu, Tao and Arik, Sercan O},
  booktitle = {The Thirteenth International Conference on Learning Representations},
  year      = {2025}
}

@article{tong2025code2logic,
  title   = {{Code2Logic}: Game-Code-Driven Data Synthesis for Enhancing {VLMs} General Reasoning},
  author  = {Tong, Jingqi and Tang, Jixin and Li, Hangcheng and Mou, Yurong and Zhang, Ming and Zhao, Jun and Wen, Yanbo and Song, Fan and Zhan, Jiahao and Lu, Yuyang and others},
  journal = {arXiv preprint arXiv:2505.13886},
  year    = {2025}
}



## Reward Shaping and Off-Policy Learning

@article{huang2025blending,
  author  = {Huang, Zeyu and Cheng, Tianhao and Qiu, Zihan and Wang, Zili and Xu, Yinghui and Ponti, Edoardo M and Titov, Ivan},
  title   = {Blending Supervised and Reinforcement Fine-Tuning with Prefix Sampling},
  year    = {2025},
  journal = {arXiv preprint arXiv:2507.01679}
}


@article{cohen2025soft,
  title   = {Soft Policy Optimization: Online Off-Policy {RL} for Sequence Models},
  author  = {Cohen, Taco and Zhang, David W and Zheng, Kunhao and Tang, Yunhao and Munos, Remi and Synnaeve, Gabriel},
  journal = {arXiv preprint arXiv:2503.05453},
  year    = {2025}
}



@article{liang2025squeeze,
  author  = {Liang, Jing and Tang, Hongyao and Ma, Yi and Liu, Jinyi and Zheng, Yan and Hu, Shuyue and Bai, Lei and Hao, Jianye},
  title   = {Squeeze the Soaked Sponge: Efficient Off-policy Reinforcement Finetuning for Large Language Model},
  year    = {2025},
  journal = {arXiv preprint arXiv:2507.06892}
}

# RL Applications
## Vision and Vision Language Tasks


## Robotics RL

@article{firoozi2025foundation,
  title     = {Foundation models in robotics: Applications, challenges, and the future},
  author    = {Firoozi, Roya and Tucker, Johnathan and Tian, Stephen and Majumdar, Anirudha and Sun, Jiankai and Liu, Weiyu and Zhu, Yuke and Song, Shuran and Kapoor, Ashish and Hausman, Karol and others},
  journal   = {The International Journal of Robotics Research},
  volume    = {44},
  number    = {5},
  pages     = {701--739},
  year      = {2025},
  publisher = {SAGE Publications},
  address   = {London, England}
}

@article{zhong2025survey,
  author  = {Zhong, Yifan and Bai, Fengshuo and Cai, Shaofei and Huang, Xuchuan and Chen, Zhang and Zhang, Xiaowei and Wang, Yuanfei and Guo, Shaoyang and Guan, Tianrui and Lui, Ka Nam and others},
  title   = {A Survey on Vision-Language-Action Models: An Action Tokenization Perspective},
  year    = {2025},
  journal = {arXiv preprint arXiv:2507.01925}
}

@article{sapkota2025vision,
  author  = {Sapkota, Ranjan and Cao, Yang and Roumeliotis, Konstantinos I and Karkee, Manoj},
  title   = {Vision-language-action models: Concepts, progress, applications and challenges},
  year    = {2025},
  journal = {arXiv preprint arXiv:2505.04769}
}


@inproceedings{o2024open,
  title        = {Open {X-Embodiment}: Robotic Learning Datasets and {RT-X} Models},
  author       = {O'Neill, Abby and Rehman, Abdul and Maddukuri, Abhiram and Gupta, Abhishek and Padalkar, Abhishek and Lee, Abraham and Pooley, Acorn and Gupta, Agrim and Mandlekar, Ajay and Jain, Ajinkya and others},
  booktitle    = {2024 IEEE International Conference on Robotics and Automation (ICRA)},
  pages        = {6892--6903},
  year         = {2024},
  organization = {IEEE}
}


## Code Generation

### Coding Tasks-TIR
@article{wei2025autotir,
  title   = {{AutoTIR}: Autonomous Tools Integrated Reasoning via Reinforcement Learning},
  author  = {Wei, Yifan and Yu, Xiaoyan and Weng, Yixuan and Pan, Tengfei and Li, Angsheng and Du, Li},
  journal = {arXiv preprint arXiv:2507.21836},
  year    = {2025}
}

@article{li2025cort,
  title   = {{CoRT}: Code-integrated Reasoning within Thinking},
  author  = {Li, Chengpeng and Tang, Zhengyang and Li, Ziniu and Xue, Mingfeng and Bao, Keqin and Ding, Tian and Sun, Ruoyu and Wang, Benyou and Wang, Xiang and Lin, Junyang and others},
  journal = {arXiv preprint arXiv:2506.09820},
  year    = {2025}
}



@article{li2025torl,
  title   = {{ToRL}: Scaling Tool-Integrated {RL}},
  author  = {Li, Xuefeng and Zou, Haoyang and Liu, Pengfei},
  journal = {arXiv preprint arXiv:2503.23383},
  year    = {2025}
}

@article{dong2025agentic,
  author  = {Dong, Guanting and Mao, Hangyu and Ma, Kai and Bao, Licheng and Chen, Yifei and Wang, Zhongyuan and Chen, Zhongxia and Du, Jiazhen and Wang, Huiyang and Zhang, Fuzheng and others},
  title   = {Agentic Reinforced Policy Optimization},
  year    = {2025},
  journal = {arXiv preprint arXiv:2507.19849}
}


## Autonomous Agents


## Medical Reasoning
@article{zhang2025med,
  title   = {{Med-RLVR}: Emerging Medical Reasoning from a {3B} Base Model via Reinforcement Learning},
  author  = {Zhang, Sheng and Liu, Qianchu and Qin, Guanghui and Naumann, Tristan and Poon, Hoifung},
  journal = {arXiv preprint arXiv:2502.19655},
  year    = {2025}
}

@misc{chen2024huatuogpto1medicalcomplexreasoning,
  title         = {{HuatuoGPT-o1}, Towards Medical Complex Reasoning with {LLMs}},
  author        = {Junying Chen and Zhenyang Cai and Ke Ji and Xidong Wang and Wanlong Liu and Rongsheng Wang and Jianye Hou and Benyou Wang},
  year          = {2024},
  eprint        = {2412.18925},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  url           = {https://arxiv.org/abs/2412.18925}
}

@article{zhang2025med-u1,
  title   = {{Med-U1}: Incentivizing Unified Medical Reasoning in {LLMs} via Large-scale Reinforcement Learning},
  author  = {Zhang, Xiaotian and Wang, Yuan and Feng, Zhaopeng and Chen, Ruizhe and Zhou, Zhijie and Zhang, Yan and Xu, Hongxia and Wu, Jian and Liu, Zuozhu},
  journal = {arXiv preprint arXiv:2506.12307},
  year    = {2025}
}

@misc{yang2025medreflmedicalreasoningenhancement,
  title         = {{Med-REFL}: Medical Reasoning Enhancement via Self-Corrected Fine-grained Reflection},
  author        = {Zongxian Yang and Jiayu Qian and Zegao Peng and Haoyu Zhang and Zhi-An Huang},
  year          = {2025},
  eprint        = {2506.13793},
  archiveprefix = {arXiv},
  primaryclass  = {cs.AI},
  url           = {https://arxiv.org/abs/2506.13793}
}

@misc{qiu2025openmedicalr1choosedatarlvr,
  title         = {{Open-Medical-R1}: How to Choose Data for {RLVR} Training at Medicine Domain},
  author        = {Zhongxi Qiu and Zhang Zhang and Yan Hu and Heng Li and Jiang Liu},
  year          = {2025},
  eprint        = {2504.13950},
  archiveprefix = {arXiv},
  primaryclass  = {cs.LG},
  url           = {https://arxiv.org/abs/2504.13950}
}

@misc{dai2025qoqmedbuildingmultimodalclinical,
  title         = {{QoQ-Med}: Building Multimodal Clinical Foundation Models with Domain-Aware {GRPO} Training},
  author        = {Wei Dai and Peilin Chen and Chanakya Ekbote and Paul Pu Liang},
  year          = {2025},
  eprint        = {2506.00711},
  archiveprefix = {arXiv},
  primaryclass  = {cs.LG},
  url           = {https://arxiv.org/abs/2506.00711}
}

@misc{yu2025finemedlmo1enhancingmedicalknowledge,
  title         = {{FineMedLM-o1}: Enhancing Medical Knowledge Reasoning Ability of {LLM} from Supervised Fine-Tuning to Test-Time Training},
  author        = {Hongzhou Yu and Tianhao Cheng and Yingwen Wang and Wen He and Qing Wang and Ying Cheng and Yuejie Zhang and Rui Feng and Xiaobo Zhang},
  year          = {2025},
  eprint        = {2501.09213},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  url           = {https://arxiv.org/abs/2501.09213}
}
% NOTE(review): the title and author list of the next entry look suspect; verify both against the arXiv listing for 2506.21594 before citing.
@misc{arora2025gazalr1,
  title         = {{Gazal-R1}: Scaling medical reasoning with {GRPO} and multi-component reward design},
  author        = {Pranav Arora and Rohan Gupta and Kavya Patel},
  year          = {2025},
  eprint        = {2506.21594},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  url           = {https://arxiv.org/abs/2506.21594}
}


@misc{pan2025medvlmr1,
  title         = {{MedVLM-R1}: Incentivizing Medical Reasoning Capability of Vision-Language Models ({VLMs}) via Reinforcement Learning},
  author        = {Jiazhen Pan and Che Liu and Junde Wu and others},
  year          = {2025},
  eprint        = {2502.19634},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CV},
  url           = {https://arxiv.org/abs/2502.19634}
}

@misc{xu2025medgroundr1,
  title         = {{MedGround-R1}: Advancing Medical Image Grounding via Spatial-Semantic Rewarded Group Relative Policy Optimization},
  author        = {Huihui Xu and Yuanpeng Nie and others},
  year          = {2025},
  eprint        = {2507.02994},
  archiveprefix = {arXiv},
  primaryclass  = {cs.LG},
  url           = {https://arxiv.org/abs/2507.02994}
}

@article{liu2025efficientvie,
  title   = {Efficient Medical {VIE} via Reinforcement Learning},
  author  = {Liu, Lijun and Li, Ruiyang and Liu, Zhaocheng and Zhu, Chenglin and Li, Chong and Cheng, Jiehan and Ju, Qiang and Xie, Jian},
  journal = {arXiv preprint arXiv:2506.13363},
  year    = {2025}
}

@misc{wang2025drg,
  title         = {Reinforcement Learning for Out-of-Distribution Reasoning in {LLMs}: An Empirical Study on Diagnosis-Related Group Coding},
  author        = {Hanyin Wang and others},
  year          = {2025},
  eprint        = {2505.21908},
  archiveprefix = {arXiv},
  primaryclass  = {cs.LG},
  url           = {https://arxiv.org/abs/2505.21908}
}

@misc{lin2025ehrmind,
  title         = {Training {LLMs} for {EHR}-Based Reasoning Tasks via Reinforcement Learning},
  author        = {Jiacheng Lin and Zhenbang Wu and others},
  year          = {2025},
  eprint        = {2505.24105},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  url           = {https://arxiv.org/abs/2505.24105}
}

@article{fan2025chestx,
  title   = {{ChestX-Reasoner}: Advancing Radiology Foundation Models with Reasoning through Step-by-Step Verification},
  author  = {Fan, Ziqing and Liang, Cheng and Wu, Chaoyi and Zhang, Ya and Wang, Yanfeng and Xie, Weidi},
  journal = {arXiv preprint arXiv:2504.20930},
  year    = {2025}
}

@misc{li2025cxmind,
  title         = {{CX-Mind}: A Pioneering Multimodal Large Language Model for Interleaved Reasoning in Chest {X-ray} via Curriculum-Guided Reinforcement Learning},
  author        = {Wenjie Li and Yujie Zhang and Haoran Sun and others},
  year          = {2025},
  eprint        = {2508.03733},
  archiveprefix = {arXiv},
  primaryclass  = {cs.LG},
  url           = {https://arxiv.org/abs/2508.03733}
}

@misc{nusrat2025dola,
  title         = {Autonomous Radiotherapy Treatment Planning Using {DOLA}: A Privacy-Preserving, {LLM}-Based Optimization Agent},
  author        = {Humza Nusrat and others},
  year          = {2025},
  eprint        = {2503.17553},
  archiveprefix = {arXiv},
  primaryclass  = {physics.med-ph},
  url           = {https://arxiv.org/abs/2503.17553}
}

@misc{baniharouni2025lacdm,
  title         = {Language Agents for Hypothesis-driven Clinical Decision Making with Reinforcement Learning},
  author        = {David Bani-Harouni and others},
  year          = {2025},
  eprint        = {2506.13474},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  url           = {https://arxiv.org/abs/2506.13474}
}

@misc{sun2025ppme,
  author        = {Zhoujian Sun and Ziyi Liu and Cheng Luo and Jiebin Chu and Zhengxing Huang},
  title         = {Improving Interactive Diagnostic Ability of a Large Language Model Agent Through Clinical Experience Learning},
  year          = {2025},
  archiveprefix = {arXiv},
  eprint        = {2503.16463},
  primaryclass  = {cs.AI},
  url           = {https://arxiv.org/abs/2503.16463}
}

@misc{lim2025moreclear,
  title         = {{MORE-CLEAR}: Multimodal Offline Reinforcement learning for Clinical notes Leveraged Enhanced State Representation},
  author        = {Yooseok Lim and ByoungJun Jeon and others},
  year          = {2025},
  eprint        = {2508.07681},
  archiveprefix = {arXiv},
  primaryclass  = {cs.LG},
  url           = {https://arxiv.org/abs/2508.07681}
}


@article{liu2025armed,
  title   = {Breaking Reward Collapse: Adaptive Reinforcement for Open-ended Medical Reasoning with Enhanced Semantic Discrimination},
  author  = {Liu, Yizhou and Wei, Jingwei and Chen, Zizhi and Han, Minghao and Zhang, Xukun and Liu, Keliang and Zhang, Lihua},
  journal = {arXiv preprint arXiv:2508.12957},
  year    = {2025}
}

@misc{ding2025promed,
  title         = {{ProMed}: {Shapley} Information Gain Guided Reinforcement Learning for Proactive Medical {LLMs}},
  author        = {Hongxin Ding and Baixiang Huang and Yue Fang and others},
  year          = {2025},
  eprint        = {2508.13514},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  url           = {https://arxiv.org/abs/2508.13514}
}

## Other Domain-Specific


## RL in multimodal understanding (xinwei)
@article{li2025imagine,
  author  = {Li, Chengzu and Wu, Wenshan and Zhang, Huanyu and Xia, Yan and Mao, Shaoguang and Dong, Li and Vuli{\'c}, Ivan and Wei, Furu},
  title   = {Imagine while reasoning in space: Multimodal visualization-of-thought},
  year    = {2025},
  journal = {arXiv preprint arXiv:2501.07542}
}

@article{su2025openthinkimg,
  title   = {{OpenThinkIMG}: Learning to Think with Images via Visual Tool Reinforcement Learning},
  author  = {Su, Zhaochen and Li, Linjie and Song, Mingyang and Hao, Yunzhuo and Yang, Zhengyuan and Zhang, Jun and Chen, Guanjie and Gu, Jiawei and Li, Juntao and Qu, Xiaoye and others},
  journal = {arXiv preprint arXiv:2505.08617},
  year    = {2025}
}


@article{zhang2025chain,
  title   = {{Chain-of-Focus}: Adaptive Visual Search and Zooming for Multimodal Reasoning via {RL}},
  author  = {Zhang, Xintong and Gao, Zhi and Zhang, Bofei and Li, Pengxiang and Zhang, Xiaowen and Liu, Yang and Yuan, Tao and Wu, Yuwei and Jia, Yunde and Zhu, Song-Chun and others},
  journal = {arXiv preprint arXiv:2505.15436},
  year    = {2025}
}


@article{xu2025viarl,
  title   = {{ViaRL}: Adaptive Temporal Grounding via Visual Iterated Amplification Reinforcement Learning},
  author  = {Xu, Ziqiang and Dai, Qi and Xie, Tian and Yang, Yifan and Qiu, Kai and Chen, DongDong and Wu, Zuxuan and Luo, Chong},
  journal = {arXiv preprint arXiv:2505.15447},
  year    = {2025}
}


@article{chung2025don,
  author  = {Chung, Jiwan and Kim, Junhyeok and Kim, Siyeol and Lee, Jaeyoung and Kim, Min Soo and Yu, Youngjae},
  title   = {Don't Look Only Once: Towards Multimodal Interactive Reasoning with Selective Visual Revisitation},
  year    = {2025},
  journal = {arXiv preprint arXiv:2505.18842}
}

@article{wang2025time,
  title   = {{Time-R1}: Post-Training Large Vision Language Model for Temporal Video Grounding},
  author  = {Wang, Ye and Wang, Ziheng and Xu, Boshen and Du, Yang and Lin, Kejun and Xiao, Zihan and Yue, Zihao and Ju, Jianzhong and Zhang, Liang and Yang, Dingyi and others},
  journal = {arXiv preprint arXiv:2503.13377},
  year    = {2025}
}


@article{zheng2025deepeyes,
  title   = {{DeepEyes}: Incentivizing ``Thinking with Images'' via Reinforcement Learning},
  author  = {Zheng, Ziwei and Yang, Michael and Hong, Jack and Zhao, Chenxiao and Xu, Guohai and Yang, Le and Shen, Chao and Yu, Xing},
  journal = {arXiv preprint arXiv:2505.14362},
  year    = {2025}
}


@article{su2025pixel,
  title   = {{Pixel Reasoner}: Incentivizing Pixel-Space Reasoning with Curiosity-Driven Reinforcement Learning},
  author  = {Su, Alex and Wang, Haozhe and Ren, Weiming and Lin, Fangzhen and Chen, Wenhu},
  journal = {arXiv preprint arXiv:2505.15966},
  year    = {2025}
}


@article{cheng2025video,
  title   = {{Video-Holmes}: Can {MLLM} Think Like {Holmes} for Complex Video Reasoning?},
  author  = {Cheng, Junhao and Ge, Yuying and Wang, Teng and Ge, Yixiao and Liao, Jing and Shan, Ying},
  journal = {arXiv preprint arXiv:2505.21374},
  year    = {2025}
}


@article{zhang2025deep,
  author  = {Zhang, Xiaoyi and Jia, Zhaoyang and Guo, Zongyu and Li, Jiahao and Li, Bin and Li, Houqiang and Lu, Yan},
  title   = {Deep Video Discovery: Agentic Search with Tool Use for Long-form Video Understanding},
  year    = {2025},
  journal = {arXiv preprint arXiv:2505.18079}
}

@article{tian2025ego,
  title   = {{Ego-R1}: Chain-of-Tool-Thought for Ultra-Long Egocentric Video Reasoning},
  author  = {Tian, Shulin and Wang, Ruiqi and Guo, Hongming and Wu, Penghao and Dong, Yuhao and Wang, Xiuying and Yang, Jingkang and Zhang, Hao and Zhu, Hongyuan and Liu, Ziwei},
  journal = {arXiv preprint arXiv:2506.13654},
  year    = {2025}
}

@article{yang2025machine,
  author  = {Yang, Zeyuan and Yu, Xueyang and Chen, Delin and Shen, Maohao and Gan, Chuang},
  title   = {Machine Mental Imagery: Empower Multimodal Reasoning with Latent Visual Tokens},
  year    = {2025},
  journal = {arXiv preprint arXiv:2506.17218}
}

@article{dineen2025qa,
  title   = {{QA-LIGN}: Aligning {LLMs} through Constitutionally Decomposed {QA}},
  author  = {Dineen, Jacob and RRV, Aswin and Liu, Qin and Xu, Zhikun and Ye, Xiao and Shen, Ming and Li, Zhaonan and Lu, Shijie and Baral, Chitta and Chen, Muhao and others},
  journal = {arXiv preprint arXiv:2506.08123},
  year    = {2025}
}

@article{wu2025mmsearch,
  title   = {{MMSearch-R1}: Incentivizing {LMMs} to Search},
  author  = {Wu, Jinming and Deng, Zihao and Li, Wei and Liu, Yiding and You, Bo and Li, Bo and Ma, Zejun and Liu, Ziwei},
  journal = {arXiv preprint arXiv:2506.20670},
  year    = {2025}
}

@article{chen2023autoagents,
  title   = {{AutoAgents}: A Framework for Automatic Agent Generation},
  author  = {Chen, Guangyao and Dong, Siwei and Shu, Yu and Zhang, Ge and Sesay, Jaward and Karlsson, B{\"o}rje F and Fu, Jie and Shi, Yemin},
  journal = {arXiv preprint arXiv:2309.17288},
  year    = {2023}
}

@article{hong2023metagpt,
  title   = {{MetaGPT}: Meta programming for multi-agent collaborative framework},
  author  = {Hong, Sirui and Zheng, Xiawu and Chen, Jonathan and Cheng, Yuheng and Wang, Jinlin and Zhang, Ceyao and Wang, Zili and Yau, Steven Ka Shing and Lin, Zijuan and Zhou, Liyang and others},
  journal = {arXiv preprint arXiv:2308.00352},
  year    = {2023}
}

@inproceedings{wu2024autogen,
  title     = {{AutoGen}: Enabling next-gen {LLM} applications via multi-agent conversations},
  author    = {Wu, Qingyun and Bansal, Gagan and Zhang, Jieyu and Wu, Yiran and Li, Beibin and Zhu, Erkang and Jiang, Li and Zhang, Xiaoyun and Zhang, Shaokun and Liu, Jiale and others},
  booktitle = {First Conference on Language Modeling},
  year      = {2024}
}

@article{li2023camel,
  title   = {{CAMEL}: Communicative agents for ``mind'' exploration of large language model society},
  author  = {Li, Guohao and Hammoud, Hasan and Itani, Hani and Khizbullin, Dmitrii and Ghanem, Bernard},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {36},
  pages   = {51991--52008},
  year    = {2023}
}

@article{novikov2025alphaevolve,
  title   = {{AlphaEvolve}: A coding agent for scientific and algorithmic discovery},
  author  = {Novikov, Alexander and V{\~u}, Ng{\^a}n and Eisenberger, Marvin and Dupont, Emilien and Huang, Po-Sen and Wagner, Adam Zsolt and Shirobokov, Sergey and Kozlovskii, Borislav and Ruiz, Francisco JR and Mehrabian, Abbas and others},
  journal = {arXiv preprint arXiv:2506.13131},
  year    = {2025}
}

@misc{liang2025openmanus,
  title  = {{OpenManus}: An open-source framework for building general {AI} agents},
  author = {Liang, Xinbin and Xiang, Jinyu and Yu, Zhaoyang and Zhang, Jiayi and Hong, Sirui and Fan, Sheng and Tang, Xiao},
  year   = {2025}
}

% NOTE(review): the journal field of the next entry holds only "arXiv" with no identifier;
% supply the arXiv number (or the actual venue) once confirmed.
@article{slumbers2023leveraging,
  title   = {Leveraging large language models for optimised coordination in textual multi-agent reinforcement learning},
  author  = {Slumbers, Oliver and Mguni, David Henry and Shao, Kun and Wang, Jun},
  journal = {arXiv},
  year    = {2023}
}

@article{zhang2024mutual,
  title   = {Mutual theory of mind in human-{AI} collaboration: An empirical study with {LLM}-driven {AI} agents in a real-time shared workspace task},
  author  = {Zhang, Shao and Wang, Xihuai and Zhang, Wenhao and Chen, Yongshan and Gao, Landi and Wang, Dakuo and Zhang, Weinan and Wang, Xinbing and Wen, Ying},
  journal = {arXiv preprint arXiv:2409.08811},
  year    = {2024}
}

@article{wan2025rema,
  title   = {{ReMA}: Learning to meta-think for {LLMs} with multi-agent reinforcement learning},
  author  = {Wan, Ziyu and Li, Yunxiang and Wen, Xiaoyu and Song, Yan and Wang, Hanjing and Yang, Linyi and Schmidt, Mark and Wang, Jun and Zhang, Weinan and Hu, Shuyue and others},
  journal = {arXiv preprint arXiv:2503.09501},
  year    = {2025}
}

@article{park2025maporl,
  title   = {{MAPoRL}: Multi-agent post-co-training for collaborative large language models with reinforcement learning},
  author  = {Park, Chanwoo and Han, Seungju and Guo, Xingzhi and Ozdaglar, Asuman and Zhang, Kaiqing and Kim, Joo-Kyung},
  journal = {arXiv preprint arXiv:2502.18439},
  year    = {2025}
}

@article{lu2025preference,
  author = {Lu, Sikai and Cai, Yingfeng and Liu, Ze and Lian, Yubo and Chen, Long and Wang, Hai},
  title = {A Preference-Based Multi-Agent Federated Reinforcement Learning Algorithm Framework for Trustworthy Interactive Urban Autonomous Driving},
  journal = {IEEE Transactions on Intelligent Transportation Systems},
  publisher = {IEEE},
  year = {2025}
}

@article{thind2025optimai,
  title   = {{OptimAI}: Optimization from Natural Language Using {LLM}-Powered {AI} Agents},
  author  = {Thind, Raghav and Sun, Youran and Liang, Ling and Yang, Haizhao},
  journal = {arXiv preprint arXiv:2504.16918},
  year    = {2025}
}

@article{wang2025genai,
  title   = {{GenAI}-based Multi-Agent Reinforcement Learning towards Distributed Agent Intelligence: A Generative-{RL} Agent Perspective},
  author  = {Wang, Hang and Zhang, Junshan},
  journal = {arXiv preprint arXiv:2507.09495},
  year    = {2025}
}

@article{shi2025aime,
  author = {Shi, Yexuan and Wang, Mingyu and Cao, Yunxiang and Lai, Hongjie and Lan, Junjian and Han, Xin and Wang, Yu and Geng, Jie and Li, Zhenan and Xia, Zihao and others},
  title = {Aime: Towards Fully-Autonomous Multi-Agent Framework},
  year = {2025},
  journal = {arXiv preprint arXiv:2507.11988}
}



@article{jin2025search,
  title   = {{Search-R1}: Training {LLMs} to reason and leverage search engines with reinforcement learning},
  author  = {Jin, Bowen and Zeng, Hansi and Yue, Zhenrui and Yoon, Jinsung and Arik, Sercan and Wang, Dong and Zamani, Hamed and Han, Jiawei},
  journal = {arXiv preprint arXiv:2503.09516},
  year    = {2025}
}

@article{huang2025cogddn,
  title   = {{CogDDN}: A Cognitive Demand-Driven Navigation with Decision Optimization and Dual-Process Thinking},
  author  = {Huang, Yuehao and Liu, Liang and Lei, Shuangming and Ma, Yukai and Su, Hao and Mei, Jianbiao and Zhao, Pengxiang and Gu, Yaqing and Liu, Yong and Lv, Jiajun},
  journal = {arXiv preprint arXiv:2507.11334},
  year    = {2025}
}

@article{song2025r1,
  title   = {{R1-Searcher}: Incentivizing the search capability in {LLMs} via reinforcement learning},
  author  = {Song, Huatong and Jiang, Jinhao and Min, Yingqian and Chen, Jie and Chen, Zhipeng and Zhao, Wayne Xin and Fang, Lei and Wen, Ji-Rong},
  journal = {arXiv preprint arXiv:2503.05592},
  year    = {2025}
}

@article{zheng2025deepresearcher,
  title   = {{DeepResearcher}: Scaling deep research via reinforcement learning in real-world environments},
  author  = {Zheng, Yuxiang and Fu, Dayuan and Hu, Xiangkun and Cai, Xiaojie and Ye, Lyumanshan and Lu, Pengrui and Liu, Pengfei},
  journal = {arXiv preprint arXiv:2504.03160},
  year    = {2025}
}

@article{luo2025gui,
  title   = {{GUI-R1}: A generalist {R1}-style vision-language action model for {GUI} agents},
  author  = {Luo, Run and Wang, Lu and He, Wanwei and Xia, Xiaobo},
  journal = {arXiv preprint arXiv:2504.10458},
  year    = {2025}
}

@article{li2025webthinker,
  title   = {{WebThinker}: Empowering large reasoning models with deep research capability},
  author  = {Li, Xiaoxi and Jin, Jiajie and Dong, Guanting and Qian, Hongjin and Zhu, Yutao and Wu, Yongkang and Wen, Ji-Rong and Dou, Zhicheng},
  journal = {arXiv preprint arXiv:2504.21776},
  year    = {2025}
}

@article{shi2025search,
  title   = {Search and Refine During Think: Autonomous Retrieval-Augmented Reasoning of {LLMs}},
  author  = {Shi, Yaorui and Li, Sihang and Wu, Chang and Liu, Zhiyuan and Fang, Junfeng and Cai, Hengxing and Zhang, An and Wang, Xiang},
  journal = {arXiv preprint arXiv:2505.11277},
  year    = {2025}
}

@article{jin2025empirical,
  title   = {An Empirical Study on Reinforcement Learning for Reasoning-Search Interleaved {LLM} Agents},
  author  = {Jin, Bowen and Yoon, Jinsung and Kargupta, Priyanka and Arik, Sercan O and Han, Jiawei},
  journal = {arXiv preprint arXiv:2505.15117},
  year    = {2025}
}

@article{chen2025visrl,
  title={{VisRL}: Intention-driven visual perception via reinforced reasoning},
  author={Chen, Zhangquan and Luo, Xufang and Li, Dongsheng},
  journal={arXiv preprint arXiv:2503.07523},
  year={2025}
}

@article{chen2025sifthinker,
  title={{SIFThinker}: Spatially-Aware Image Focus for Visual Reasoning},
  author={Chen, Zhangquan and Zhao, Ruihui and Luo, Chuwei and Sun, Mingze and Yu, Xinlei and Kang, Yangyang and Huang, Ruqi},
  journal={arXiv preprint arXiv:2508.06259},
  year={2025}
}

@article{lu2025arpo,
  title   = {{ARPO}: End-to-End Policy Optimization for {GUI} Agents with Experience Replay},
  author  = {Lu, Fanbin and Zhong, Zhisheng and Liu, Shu and Fu, Chi-Wing and Jia, Jiaya},
  journal = {arXiv preprint arXiv:2505.16282},
  year    = {2025}
}

@article{wu2025webdancer,
  title   = {{WebDancer}: Towards Autonomous Information Seeking Agency},
  author  = {Wu, Jialong and Li, Baixuan and Fang, Runnan and Yin, Wenbiao and Zhang, Liwen and Tao, Zhengwei and Zhang, Dingchu and Xi, Zekun and Fu, Gang and Jiang, Yong and others},
  journal = {arXiv preprint arXiv:2505.22648},
  year    = {2025}
}

@article{zhi2025medgr2,
  title   = {{MedGR$^2$}: Breaking the Data Barrier for Medical Reasoning via Generative Reward Learning},
  author  = {Zhi, Weihai and Guo, Jiayan and Li, Shangyang},
  journal = {arXiv preprint arXiv:2508.20549},
  year    = {2025}
}


@article{2025baichuan,
  title   = {{Baichuan-M1}: Pushing the Medical Capability of Large Language Models},
  author  = {{Baichuan Inc.}},
  journal = {arXiv preprint arXiv:2502.12671},
  year    = {2025}
}

@article{2025baichuan2,
  title   = {{Baichuan-M2}: Scaling Medical Capability with Large Verifier System},
  author  = {{Baichuan Inc.}},
  journal = {arXiv preprint arXiv:2509.02208},
  year    = {2025}
}
## Applications - Multimodal Tasks

### Image Understanding


### Video and 3D Understanding


### Multimodal Generation

@inproceedings{rombach2022high,
  title     = {High-resolution image synthesis with latent diffusion models},
  author    = {Rombach, Robin and Blattmann, Andreas and Lorenz, Dominik and Esser, Patrick and Ommer, Bj{\"o}rn},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {10684--10695},
  year      = {2022}
}

@inproceedings{liuflow,
  title     = {Flow Straight and Fast: Learning to Generate and Transfer Data with Rectified Flow},
  author    = {Liu, Xingchao and Gong, Chengyue and Liu, Qiang},
  booktitle = {The Eleventh International Conference on Learning Representations},
  year      = {2023}
}

@inproceedings{guo2025can,
  title     = {Can We Generate Images with {CoT}? Let's Verify and Reinforce Image Generation Step by Step},
  author    = {Guo, Ziyu and Zhang, Renrui and Tong, Chengzhuo and Zhao, Zhizheng and Gao, Peng and Li, Hongsheng and Heng, Pheng-Ann},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  year      = {2025}
}

@article{wang2025simplear,
  title   = {{SimpleAR}: Pushing the frontier of autoregressive visual generation through pretraining, {SFT}, and {RL}},
  author  = {Wang, Junke and Tian, Zhi and Wang, Xun and Zhang, Xinyu and Huang, Weilin and Wu, Zuxuan and Jiang, Yu-Gang},
  journal = {arXiv preprint arXiv:2504.11455},
  year    = {2025}
}

@article{pan2025focusdiff,
  title   = {{FocusDiff}: Advancing Fine-Grained Text-Image Alignment for Autoregressive Visual Generation through {RL}},
  author  = {Pan, Kaihang and Bu, Wendong and Wu, Yuruo and Wu, Yang and Shen, Kai and Li, Yunfei and Zhao, Hang and Li, Juncheng and Tang, Siliang and Zhuang, Yueting},
  journal = {arXiv preprint arXiv:2506.05501},
  year    = {2025}
}

@article{wu2025reprompt,
  title   = {{RePrompt}: Reasoning-Augmented Reprompting for Text-to-Image Generation via Reinforcement Learning},
  author  = {Wu, Mingrui and Wang, Lu and Zhao, Pu and Yang, Fangkai and Zhang, Jianjin and Liu, Jianfeng and Zhan, Yuefeng and Han, Weihao and Sun, Hao and Ji, Jiayi and others},
  journal = {arXiv preprint arXiv:2505.17540},
  year    = {2025}
}

@article{jiang2025t2i,
  title   = {{T2I-R1}: Reinforcing image generation with collaborative semantic-level and token-level {CoT}},
  author  = {Jiang, Dongzhi and Guo, Ziyu and Zhang, Renrui and Zong, Zhuofan and Li, Hao and Zhuo, Le and Yan, Shilin and Heng, Pheng-Ann and Li, Hongsheng},
  journal = {arXiv preprint arXiv:2505.00703},
  year    = {2025}
}

@article{duan2025got,
  title   = {{GOT-R1}: Unleashing reasoning capability of {MLLM} for visual generation with reinforcement learning},
  author  = {Duan, Chengqi and Fang, Rongyao and Wang, Yuqing and Wang, Kun and Huang, Linjiang and Zeng, Xingyu and Li, Hongsheng and Liu, Xihui},
  journal = {arXiv preprint arXiv:2505.17022},
  year    = {2025}
}

@article{jiang2025co,
  author = {Jiang, Jingjing and Si, Chongjie and Luo, Jun and Zhang, Hanwang and Ma, Chao},
  title = {Co-Reinforcement Learning for Unified Multimodal Understanding and Generation},
  year = {2025},
  journal = {arXiv preprint arXiv:2505.17534}
}

@article{hong2025reinforcing,
  author = {Hong, Jixiang and Zhang, Yiran and Wang, Guanzhong and Liu, Yi and Wen, Ji-Rong and Yan, Rui},
  title = {Reinforcing Multimodal Understanding and Generation with Dual Self-rewards},
  year = {2025},
  journal = {arXiv preprint arXiv:2506.07963}
}

@article{zhang2025reasongen,
  title   = {{ReasonGen-R1}: {CoT} for Autoregressive Image generation models through {SFT} and {RL}},
  author  = {Zhang, Yu and Li, Yunqi and Yang, Yifan and Wang, Rui and Yang, Yuqing and Qi, Dai and Bao, Jianmin and Chen, Dongdong and Luo, Chong and Qiu, Lili},
  journal = {arXiv preprint arXiv:2505.24875},
  year    = {2025}
}

@article{pan2025self,
  author = {Pan, Jiadong and Ma, Zhiyuan and Zhang, Kaiyan and Ding, Ning and Zhou, Bowen},
  title = {Self-Reflective Reinforcement Learning for Diffusion-based Image Reasoning Generation},
  year = {2025},
  journal = {arXiv preprint arXiv:2505.22407}
}

@article{liu2025flow,
  title   = {{Flow-GRPO}: Training flow matching models via online {RL}},
  author  = {Liu, Jie and Liu, Gongye and Liang, Jiajun and Li, Yangguang and Liu, Jiaheng and Wang, Xintao and Wan, Pengfei and Zhang, Di and Ouyang, Wanli},
  journal = {arXiv preprint arXiv:2505.05470},
  year    = {2025}
}

@article{xue2025dancegrpo,
  title   = {{DanceGRPO}: Unleashing {GRPO} on Visual Generation},
  author  = {Xue, Zeyue and Wu, Jie and Gao, Yu and Kong, Fangyuan and Zhu, Lingting and Chen, Mengzhao and Liu, Zhiheng and Liu, Wei and Guo, Qiushan and Huang, Weilin and others},
  journal = {arXiv preprint arXiv:2505.07818},
  year    = {2025}
}

@article{wu2025qwen,
  title   = {{Qwen-Image} Technical Report},
  author  = {Wu, Chenfei and Li, Jiahao and Zhou, Jingren and Lin, Junyang and Gao, Kaiyuan and Yan, Kun and Yin, Sheng-ming and Bai, Shuai and Xu, Xiao and Chen, Yilei and others},
  journal = {arXiv preprint arXiv:2508.02324},
  year    = {2025}
}

@article{he2025tempflow,
  title   = {{TempFlow-GRPO}: When Timing Matters for {GRPO} in Flow Models},
  author  = {He, Xiaoxuan and Fu, Siming and Zhao, Yuke and Li, Wanli and Yang, Jian and Yin, Dacheng and Rao, Fengyun and Zhang, Bo},
  journal = {arXiv preprint arXiv:2508.04324},
  year    = {2025}
}

@article{li2025mixgrpo,
  title   = {{MixGRPO}: Unlocking Flow-based {GRPO} Efficiency with Mixed {ODE-SDE}},
  author  = {Li, Junzhe and Cui, Yutao and Huang, Tao and Ma, Yinping and Fan, Chun and Yang, Miles and Zhong, Zhao},
  journal = {arXiv preprint arXiv:2507.21802},
  year    = {2025}
}

@misc{openai2024gpt4oimage,
  author       = {OpenAI},
  title        = {Introducing {GPT-4o} Image Generation},
  year         = {2025},
  howpublished = {\url{https://openai.com/index/introducing-4o-image-generation/}},
  note         = {Accessed: 2025-08-25}
}

@inproceedings{wallace2024diffusion,
  author = {Wallace, Bram and Dang, Meihua and Rafailov, Rafael and Zhou, Linqi and Lou, Aaron and Purushwalkam, Senthil and Ermon, Stefano and Xiong, Caiming and Joty, Shafiq and Naik, Nikhil},
  title = {Diffusion model alignment using direct preference optimization},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  year = {2024},
  pages = {8228--8238}
}

@inproceedings{liang2025aesthetic,
  author = {Liang, Zhanhao and Yuan, Yuhui and Gu, Shuyang and Chen, Bohan and Hang, Tiankai and Cheng, Mingxi and Li, Ji and Zheng, Liang},
  title = {Aesthetic post-training diffusion models from generic preferences with step-by-step preference optimization},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference},
  year = {2025},
  pages = {13199--13208}
}

@article{tong2025delving,
  title   = {Delving into {RL} for Image Generation with {CoT}: A Study on {DPO} vs. {GRPO}},
  author  = {Tong, Chengzhuo and Guo, Ziyu and Zhang, Renrui and Shan, Wenyu and Wei, Xinyu and Xing, Zhenghao and Li, Hongsheng and Heng, Pheng-Ann},
  journal = {arXiv preprint arXiv:2505.17017},
  year    = {2025}
}

@inproceedings{liu2025videodpo,
  title     = {{VideoDPO}: Omni-preference alignment for video diffusion generation},
  author    = {Liu, Runtao and Wu, Haoyu and Zheng, Ziqiang and Wei, Chen and He, Yingqing and Pi, Renjie and Chen, Qifeng},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference},
  pages     = {8009--8019},
  year      = {2025}
}

@inproceedings{liu2025video,
  title     = {{Video-T1}: Test-time scaling for video generation},
  author    = {Liu, Fangfu and Wang, Hanyang and Cai, Yimo and Zhang, Kaiyan and Zhan, Xiaohang and Duan, Yueqi},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision},
  year      = {2025}
}

@article{ma2025inference,
  author = {Ma, Nanye and Tong, Shangyuan and Jia, Haolin and Hu, Hexiang and Su, Yu-Chuan and Zhang, Mingda and Yang, Xuan and Li, Yandong and Jaakkola, Tommi and Jia, Xuhui and others},
  title = {Inference-time scaling for diffusion models beyond scaling denoising steps},
  year = {2025},
  journal = {arXiv preprint arXiv:2501.09732}
}

@inproceedings{singhalgeneral,
  author = {Singhal, Raghav and Horvitz, Zachary and Teehan, Ryan and Ren, Mengye and Yu, Zhou and McKeown, Kathleen and Ranganath, Rajesh},
  title = {A General Framework for Inference-time Scaling and Steering of Diffusion Models},
  year = {2025},
  booktitle = {Forty-second International Conference on Machine Learning}
}

@inproceedings{esser2024scaling,
  title     = {Scaling rectified flow transformers for high-resolution image synthesis},
  author    = {Esser, Patrick and Kulal, Sumith and Blattmann, Andreas and Entezari, Rahim and M{\"u}ller, Jonas and Saini, Harry and Levi, Yam and Lorenz, Dominik and Sauer, Axel and Boesel, Frederic and others},
  booktitle = {Forty-first International Conference on Machine Learning},
  year      = {2024}
}

@article{lin2025reasoning,
  author = {Lin, Wang and Jia, Liyu and Hu, Wentao and Pan, Kaihang and Yue, Zhongqi and Zhao, Wei and Chen, Jingyuan and Wu, Fei and Zhang, Hanwang},
  title = {Reasoning physical video generation with diffusion timestep tokens via reinforcement learning},
  year = {2025},
  journal = {arXiv preprint arXiv:2504.15932}
}

@article{fang2025inflvg,
  title   = {{InfLVG}: Reinforce Inference-Time Consistent Long Video Generation with {GRPO}},
  author  = {Fang, Xueji and Ma, Liyuan and Chen, Zhiyang and Zhou, Mingyuan and Qi, Guo-jun},
  journal = {arXiv preprint arXiv:2505.17574},
  year    = {2025}
}

@article{kong2024hunyuanvideo,
  title   = {{HunyuanVideo}: A systematic framework for large video generative models},
  author  = {Kong, Weijie and Tian, Qi and Zhang, Zijian and Min, Rox and Dai, Zuozhuo and Zhou, Jin and Xiong, Jiangfeng and Li, Xin and Wu, Bo and Zhang, Jianwei and others},
  journal = {arXiv preprint arXiv:2412.03603},
  year    = {2024}
}

@article{liu2025improving,
  author = {Liu, Jie and Liu, Gongye and Liang, Jiajun and Yuan, Ziyang and Liu, Xiaokun and Zheng, Mingwu and Wu, Xiele and Wang, Qiulin and Qin, Wenyu and Xia, Menghan and others},
  title = {Improving video generation with human feedback},
  year = {2025},
  journal = {arXiv preprint arXiv:2501.13918}
}

@article{schick2023toolformer,
  author = {Schick, Timo and Dwivedi-Yu, Jane and Dess{\`\i}, Roberto and Raileanu, Roberta and Lomeli, Maria and Hambro, Eric and Zettlemoyer, Luke and Cancedda, Nicola and Scialom, Thomas},
  title = {Toolformer: Language models can teach themselves to use tools},
  journal = {Advances in Neural Information Processing Systems},
  year = {2023},
  volume = {36},
  pages = {68539--68551}
}

@article{sun2025zerosearch,
  title   = {{ZeroSearch}: Incentivize the search capability of {LLMs} without searching},
  author  = {Sun, Hao and Qiao, Zile and Guo, Jiayan and Fan, Xuanbo and Hou, Yingyan and Jiang, Yong and Xie, Pengjun and Zhang, Yan and Huang, Fei and Zhou, Jingren},
  journal = {arXiv preprint arXiv:2505.04588},
  year    = {2025}
}

@article{wei2025browsecomp,
  title   = {{BrowseComp}: A simple yet challenging benchmark for browsing agents},
  author  = {Wei, Jason and Sun, Zhiqing and Papay, Spencer and McKinney, Scott and Han, Jeffrey and Fulford, Isa and Chung, Hyung Won and Passos, Alex Tachard and Fedus, William and Glaese, Amelia},
  journal = {arXiv preprint arXiv:2504.12516},
  year    = {2025}
}

@inproceedings{mialon2023gaia,
  title     = {{GAIA}: a benchmark for general {AI} assistants},
  author    = {Mialon, Gr{\'e}goire and Fourrier, Cl{\'e}mentine and Swift, Craig and Wolf, Thomas and LeCun, Yann and Scialom, Thomas},
  booktitle = {The Twelfth International Conference on Learning Representations},
  year      = {2024}
}

@article{li2025websailor,
  title   = {{WebSailor}: Navigating Super-human Reasoning for Web Agent},
  author  = {Li, Kuan and Zhang, Zhongwang and Yin, Huifeng and Zhang, Liwen and Ou, Litu and Wu, Jialong and Yin, Wenbiao and Li, Baixuan and Tao, Zhengwei and Wang, Xinyu and others},
  journal = {arXiv preprint arXiv:2507.02592},
  year    = {2025}
}

@article{tao2025webshaper,
  title   = {{WebShaper}: Agentically Data Synthesizing via Information-Seeking Formalization},
  author  = {Tao, Zhengwei and Wu, Jialong and Yin, Wenbiao and Zhang, Junkai and Li, Baixuan and Shen, Haiyang and Li, Kuan and Zhang, Liwen and Wang, Xinyu and Jiang, Yong and others},
  journal = {arXiv preprint arXiv:2507.15061},
  year    = {2025}
}

@article{lu2025ui,
  title   = {{UI-R1}: Enhancing Efficient Action Prediction of {GUI} Agents by Reinforcement Learning},
  author  = {Lu, Zhengxi and Chai, Yuxiang and Guo, Yaxuan and Yin, Xi and Liu, Liang and Wang, Hao and Xiao, Han and Ren, Shuai and Xiong, Guanjing and Li, Hongsheng},
  journal = {arXiv preprint arXiv:2503.21620},
  year    = {2025}
}

@article{wang2025opencua,
  title   = {{OpenCUA}: Open foundations for computer-use agents},
  author  = {Wang, Xinyuan and Wang, Bowen and Lu, Dunjie and Yang, Junlin and Xie, Tianbao and Wang, Junli and Deng, Jiaqi and Guo, Xiaole and Xu, Yiheng and Wu, Chen Henry and others},
  journal = {arXiv preprint arXiv:2508.09123},
  year    = {2025}
}

@article{zhou2025gui,
  title   = {{GUI-G1}: Understanding {R1-Zero}-like training for visual grounding in {GUI} agents},
  author  = {Zhou, Yuqi and Dai, Sunhao and Wang, Shuai and Zhou, Kaiwen and Jia, Qinglin and Xu, Jun},
  journal = {arXiv preprint arXiv:2505.15810},
  year    = {2025}
}

@article{liu2025infigui,
  title   = {{InfiGUI-R1}: Advancing multimodal {GUI} agents from reactive actors to deliberative reasoners},
  author  = {Liu, Yuhang and Li, Pengxiang and Xie, Congkai and Hu, Xavier and Han, Xiaotian and Zhang, Shengyu and Yang, Hongxia and Wu, Fei},
  journal = {arXiv preprint arXiv:2504.14239},
  year    = {2025}
}

@article{guo2025deepseek,
  title   = {{DeepSeek-R1}: Incentivizing reasoning capability in {LLMs} via reinforcement learning},
  author  = {Guo, Daya and Yang, Dejian and Zhang, Haowei and Song, Junxiao and Zhang, Ruoyu and Xu, Runxin and Zhu, Qihao and Ma, Shirong and Wang, Peiyi and Bi, Xiao and others},
  journal = {arXiv preprint arXiv:2501.12948},
  year    = {2025}
}

@article{qin2024o1,
  author = {Qin, Yiwei and Li, Xuefeng and Zou, Haoyang and Liu, Yixiu and Xia, Shijie and Huang, Zhen and Ye, Yixin and Yuan, Weizhe and Liu, Hector and Li, Yuanzhi and others},
  title = {O1 Replication Journey: A Strategic Progress Report--Part 1},
  year = {2024},
  journal = {arXiv preprint arXiv:2410.18982}
}


@inproceedings{yang2025thinking-in-space,
  author = {Yang, Jihan and Yang, Shusheng and Gupta, Anjali W and Han, Rilyn and Fei-Fei, Li and Xie, Saining},
  title = {Thinking in space: How multimodal large language models see, remember, and recall spaces},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference},
  year = {2025},
  pages = {10632--10643}
}

@article{wu2025spatial-mllm,
  title   = {{Spatial-MLLM}: Boosting {MLLM} capabilities in visual-based spatial intelligence},
  author  = {Wu, Diankun and Liu, Fangfu and Hung, Yi-Hsin and Duan, Yueqi},
  journal = {arXiv preprint arXiv:2505.23747},
  year    = {2025}
}

@article{pan2025metaspatial,
  title   = {{MetaSpatial}: Reinforcing {3D} spatial reasoning in {VLMs} for the metaverse},
  author  = {Pan, Zhenyu and Liu, Han},
  journal = {arXiv preprint arXiv:2503.18470},
  year    = {2025}
}

@article{shao2024deepseekmath,
  title   = {{DeepSeekMath}: Pushing the limits of mathematical reasoning in open language models},
  author  = {Shao, Zhihong and Wang, Peiyi and Zhu, Qihao and Xu, Runxin and Song, Junxiao and Bi, Xiao and Zhang, Haowei and Zhang, Mingchuan and Li, YK and Wu, Yang and others},
  journal = {arXiv preprint arXiv:2402.03300},
  year    = {2024}
}

@article{ouyang2025spaceR,
  title   = {{SpaceR}: Reinforcing {MLLMs} in Video Spatial Reasoning},
  author  = {Ouyang, Kun and Liu, Yuanxin and Wu, Haoning and Liu, Yi and Zhou, Hao and Zhou, Jie and Meng, Fandong and Sun, Xu},
  journal = {arXiv preprint arXiv:2504.01805},
  year    = {2025}
}

@article{zhou2025roborefer,
  title   = {{RoboRefer}: Towards Spatial Referring with Reasoning in Vision-Language Models for Robotics},
  author  = {Zhou, Enshen and An, Jingkun and Chi, Cheng and Han, Yi and Rong, Shanyu and Zhang, Chi and Wang, Pengwei and Wang, Zhongyuan and Huang, Tiejun and Sheng, Lu and others},
  journal = {arXiv preprint arXiv:2506.04308},
  year    = {2025}
}

@article{liao2025improved-visual-spatial,
  title   = {Improved visual-spatial reasoning via {R1-Zero}-like training},
  author  = {Liao, Zhenyi and Xie, Qingsong and Zhang, Yanhao and Kong, Zijian and Lu, Haonan and Yang, Zhenyu and Deng, Zhijie},
  journal = {arXiv preprint arXiv:2504.00883},
  year    = {2025}
}

@article{guan2025rstar,
  title   = {{rStar-Math}: Small {LLMs} Can Master Math Reasoning with Self-Evolved Deep Thinking},
  author  = {Guan, Xinyu and Zhang, Li Lyna and Liu, Yifei and Shang, Ning and Sun, Youran and Zhu, Yi and Yang, Fan and Yang, Mao},
  journal = {arXiv preprint arXiv:2501.04519},
  year    = {2025}
}

@article{luo2024improve,
  author = {Luo, Liangchen and Liu, Yinxiao and Liu, Rosanne and Phatale, Samrat and Guo, Meiqi and Lara, Harsh and Li, Yunxuan and Shu, Lei and Zhu, Yun and Meng, Lei and others},
  title = {Improve mathematical reasoning in language models by automated process supervision},
  year = {2024},
  journal = {arXiv preprint arXiv:2406.06592}
}

@article{wu2025totrl,
  title   = {{ToTRL}: Unlock {LLM} Tree-of-Thoughts Reasoning Potential through Puzzles Solving},
  author  = {Wu, Haoyuan and Chen, Xueyi and Ming, Rui and Gao, Jilong and Hu, Shoubo and He, Zhuolun and Yu, Bei},
  journal = {arXiv preprint arXiv:2505.12717},
  year    = {2025}
}


@article{ji2025difficulty,
  title   = {How Difficulty-Aware Staged Reinforcement Learning Enhances {LLMs}' Reasoning Capabilities: A Preliminary Experimental Study},
  author  = {Ji, Yunjie and Zhao, Sitong and Tian, Xiaoyu and Wang, Haotian and Chen, Shuaiting and Peng, Yiping and Zhao, Han and Li, Xiangang},
  journal = {arXiv preprint arXiv:2504.00829},
  year    = {2025}
}

@article{team2025kimi,
  title   = {{Kimi k1.5}: Scaling reinforcement learning with {LLMs}},
  author  = {{Kimi Team}},
  journal = {arXiv preprint arXiv:2501.12599},
  year    = {2025}
}

@article{chen2025self,
  title   = {Self-Evolving Curriculum for {LLM} Reasoning},
  author  = {Chen, Xiaoyin and Lu, Jiarui and Kim, Minsu and Zhang, Dinghuai and Tang, Jian and Pich{\'e}, Alexandre and Gontier, Nicolas and Bengio, Yoshua and Kamalloo, Ehsan},
  journal = {arXiv preprint arXiv:2505.14970},
  year    = {2025}
}

@article{parashar2025curriculum,
  title   = {Curriculum Reinforcement Learning from Easy to Hard Tasks Improves {LLM} Reasoning},
  author  = {Parashar, Shubham and Gui, Shurui and Li, Xiner and Ling, Hongyi and Vemuri, Sushil and Olson, Blake and Li, Eric and Zhang, Yu and Caverlee, James and Kalathil, Dileep and others},
  journal = {arXiv preprint arXiv:2506.06632},
  year    = {2025}
}
@article{zhang2025speed,
  title   = {{SPEED-RL}: Faster Training of Reasoning Models via Online Curriculum Learning},
  author  = {Zhang, Ruiqi and Arora, Daman and Mei, Song and Zanette, Andrea},
  journal = {arXiv preprint arXiv:2506.09016},
  year    = {2025}
}

@article{zheng2025act,
  title   = {Act Only When It Pays: Efficient Reinforcement Learning for {LLM} Reasoning via Selective Rollouts},
  author  = {Zheng, Haizhong and Zhou, Yang and Bartoldson, Brian R and Kailkhura, Bhavya and Lai, Fan and Zhao, Jiawei and Chen, Beidi},
  journal = {arXiv preprint arXiv:2506.02177},
  year    = {2025}
}
@article{sun2025improving,
  title   = {Improving Data Efficiency for {LLM} Reinforcement Fine-tuning Through Difficulty-targeted Online Data Selection and Rollout Replay},
  author  = {Sun, Yifan and Shen, Jingyan and Wang, Yibin and Chen, Tianyu and Wang, Zhendong and Zhou, Mingyuan and Zhang, Huan},
  journal = {arXiv preprint arXiv:2506.05316},
  year    = {2025}
}
@article{shi2025efficient,
  author = {Shi, Taiwei and Wu, Yiyang and Song, Linxin and Zhou, Tianyi and Zhao, Jieyu},
  title = {Efficient reinforcement finetuning via adaptive curriculum learning},
  year = {2025},
  journal = {arXiv preprint arXiv:2504.05520}
}
@article{do2025sparft,
  title   = {{SPaRFT}: Self-Paced Reinforcement Fine-Tuning for Large Language Models},
  author  = {Do, Dai and Nguyen, Manh and Venkatesh, Svetha and Le, Hung},
  journal = {arXiv preprint arXiv:2508.05015},
  year    = {2025}
}

@article{hu2024openrlhf,
  title   = {{OpenRLHF}: An easy-to-use, scalable and high-performance {RLHF} framework},
  author  = {Hu, Jian and Wu, Xibin and Zhu, Zilin and Wang, Weixun and Zhang, Dehao and Cao, Yu and others},
  journal = {arXiv preprint arXiv:2405.11143},
  year    = {2024}
}

@inproceedings{sheng2025hybridflow,
  title     = {{HybridFlow}: A flexible and efficient {RLHF} framework},
  author    = {Sheng, Guangming and Zhang, Chi and Ye, Zilingfeng and Wu, Xibin and Zhang, Wang and Zhang, Ru and Peng, Yanghua and Lin, Haibin and Wu, Chuan},
  booktitle = {Proceedings of the Twentieth European Conference on Computer Systems},
  pages     = {1279--1297},
  year      = {2025}
}

@article{wang2025reinforcement-roll,
  author = {Wang, Weixun and Xiong, Shaopan and Chen, Gengru and Gao, Wei and Guo, Sheng and He, Yancheng and Huang, Ju and Liu, Jiaheng and Li, Zhendong and Li, Xiaoyang and others},
  title = {Reinforcement Learning Optimization for Large-Scale Learning: An Efficient and User-Friendly Scaling Library},
  year = {2025},
  journal = {arXiv preprint arXiv:2506.06122}
}

@misc{vonwerra2022trl,
  author       = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallou{\'e}dec, Quentin},
  title        = {{TRL}: Transformer Reinforcement Learning},
  year         = {2020},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/huggingface/trl}}
}

@misc{nemo-rl,
  title        = {{NeMo RL}: A Scalable and Efficient Post-Training Library},
  author       = {NVIDIA-NeMo},
  howpublished = {\url{https://github.com/NVIDIA-NeMo/RL}},
  year         = {2025},
  note         = {GitHub repository}
}

@misc{THUDM-slime,
  title        = {slime: An {SGLang}-Native Post-Training Framework for {RL} Scaling},
  author       = {THUDM},
  howpublished = {\url{https://github.com/THUDM/slime}},
  year         = {2025},
  note         = {GitHub repository}
}

@inproceedings{zheng2024llamafactory,
  author    = {Yaowei Zheng and Richong Zhang and Junhao Zhang and Yanhan Ye and Zheyan Luo and Zhangchi Feng and Yongqiang Ma},
  title     = {LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models},
  booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)},
  year      = {2024},
  publisher = {Association for Computational Linguistics},
  address   = {Bangkok, Thailand},
  url       = {http://arxiv.org/abs/2403.13372},
}

@article{zhang2025design,
  author  = {Zhang, Yifan and Liu, Yifeng and Yuan, Huizhuo and Yuan, Yang and Gu, Quanquan and Yao, Andrew C},
  title   = {On the Design of KL-Regularized Policy Gradient Algorithms for LLM Reasoning},
  journal = {arXiv preprint arXiv:2505.17508},
  year    = {2025},
}

% Cleaned scholar-export entry: the title no longer ends in a period (styles
% add their own), the venue acronym is brace-protected, the conference city
% sits in the address field, and the complete four-author list is given
% without a trailing "and others".
@inproceedings{ziebart2008maximum,
  author    = {Ziebart, Brian D and Maas, Andrew L and Bagnell, J Andrew and Dey, Anind K},
  title     = {Maximum entropy inverse reinforcement learning},
  booktitle = {Proceedings of the {AAAI} Conference on Artificial Intelligence},
  volume    = {8},
  pages     = {1433--1438},
  year      = {2008},
  address   = {Chicago, IL, USA}
}

% Blog post. Standard styles do not read a journal field on misc entries, so
% the hosting site is stored in howpublished, where it is actually printed.
@misc{yao2025offpolicy,
  author       = {Yao, Feng and Liu, Liyuan and Zhang, Dinghuai and Dong, Chengyu and Shang, Jingbo and Gao, Jianfeng},
  title        = {Your Efficient RL Framework Secretly Brings You Off-Policy RL Training},
  howpublished = {Feng Yao's Notion},
  url          = {https://fengyao.notion.site/off-policy-rl},
  year         = {2025},
  month        = aug
}

% Blog post. Standard styles do not read a journal field on misc entries, so
% the hosting site is stored in howpublished, where it is actually printed.
@misc{yao2025flashrl,
  author       = {Liu, Liyuan and Yao, Feng and Zhang, Dinghuai and Dong, Chengyu and Shang, Jingbo and Gao, Jianfeng},
  title        = {FlashRL: 8Bit Rollouts, Full Power RL},
  howpublished = {Feng Yao's Notion},
  url          = {https://fengyao.notion.site/flash-rl},
  year         = {2025},
  month        = aug
}

% Software repository, not a journal article. The organisation name is
% double-braced so it is treated as one indivisible author, the repository
% URL (previously broken by a stray space inside the journal field) moves to
% howpublished, and the release date leaves the title for year/month.
@misc{faceopen,
  author       = {{Hugging Face}},
  title        = {Open {R1}: A fully open reproduction of {DeepSeek-R1}},
  howpublished = {\url{https://github.com/huggingface/open-r1}},
  year         = {2025},
  month        = jan
}

@article{chen2025acereason,
  author  = {Chen, Yang and Yang, Zhuolin and Liu, Zihan and Lee, Chankyu and Xu, Peng and Shoeybi, Mohammad and Catanzaro, Bryan and Ping, Wei},
  title   = {Acereason-nemotron: Advancing math and code reasoning through reinforcement learning},
  journal = {arXiv preprint arXiv:2505.16400},
  year    = {2025},
}

@article{pourreza2025reasoning,
  author  = {Pourreza, Mohammadreza and Talaei, Shayan and Sun, Ruoxi and Wan, Xingchen and Li, Hailong and Mirhoseini, Azalia and Saberi, Amin and Arik, Sercan and others},
  title   = {Reasoning-sql: Reinforcement learning with sql tailored partial rewards for reasoning-enhanced text-to-sql},
  journal = {arXiv preprint arXiv:2503.23157},
  year    = {2025},
}

@article{dai2025reex,
  author  = {Dai, Yaxun and Xie, Wenxuan and Zhuang, Xialie and Yang, Tianyu and Yang, Yiying and Yang, Haiqin and Zhao, Yuhang and Chao, Pingfu and Jiang, Wenhao},
  title   = {ReEx-SQL: Reasoning with Execution-Aware Reinforcement Learning for Text-to-SQL},
  journal = {arXiv preprint arXiv:2505.12768},
  year    = {2025},
}

@article{gajjar2025cognisql,
  author  = {Gajjar, Kushal and Sikchi, Harshit and Gautam, Arpit Singh and Hammons, Marc and Jha, Saurabh},
  title   = {CogniSQL-R1-Zero: Lightweight Reinforced Reasoning for Efficient SQL Generation},
  journal = {arXiv preprint arXiv:2507.06013},
  year    = {2025},
}

% arXiv preprint. The journal field follows the "arXiv preprint" form used by
% the neighbouring entries instead of a URL broken by a stray space, and the
% year is no longer duplicated at the end of the title.
@article{wang2504kimina,
  author  = {Wang, Haiming and Unsal, Mert and Lin, Xiaohan and Baksys, Mantas and Liu, Junqi and Santos, MD and Sung, Flood and Vinyes, Marina and Ying, Zhenzhe and Zhu, Zekai and others},
  title   = {Kimina-prover preview: Towards large formal reasoning models with reinforcement learning},
  journal = {arXiv preprint arXiv:2504.11354},
  year    = {2025}
}

@article{ren2025deepseek,
  author  = {Ren, ZZ and Shao, Zhihong and Song, Junxiao and Xin, Huajian and Wang, Haocheng and Zhao, Wanjia and Zhang, Liyue and Fu, Zhe and Zhu, Qihao and Yang, Dejian and others},
  title   = {Deepseek-prover-v2: Advancing formal mathematical reasoning via reinforcement learning for subgoal decomposition},
  journal = {arXiv preprint arXiv:2504.21801},
  year    = {2025},
}

@article{shang2025stepfun,
  author  = {Shang, Shijie and Wan, Ruosi and Peng, Yue and Wu, Yutong and Chen, Xiong-hui and Yan, Jie and Zhang, Xiangyu},
  title   = {StepFun-Prover Preview: Let's Think and Verify Step by Step},
  journal = {arXiv preprint arXiv:2507.20199},
  year    = {2025},
}

@article{ji2025leanabell,
  author  = {Ji, Xingguang and Liu, Yahui and Wang, Qi and Zhang, Jingyuan and Yue, Yang and Shi, Rui and Sun, Chenxi and Zhang, Fuzheng and Zhou, Guorui and Gai, Kun},
  title   = {Leanabell-prover-v2: Verifier-integrated reasoning for formal theorem proving via reinforcement learning},
  journal = {arXiv preprint arXiv:2507.08649},
  year    = {2025},
}



@article{wang2025verireason,
  author  = {Wang, Yiting and Sun, Guoheng and Ye, Wanghao and Qu, Gang and Li, Ang},
  title   = {VeriReason: Reinforcement Learning with Testbench Feedback for Reasoning-Enhanced Verilog Generation},
  journal = {arXiv preprint arXiv:2505.11849},
  year    = {2025},
}

@article{zhu2025codev,
  author  = {Zhu, Yaoyu and Huang, Di and Lyu, Hanqi and Zhang, Xiaoyun and Li, Chongxiao and Shi, Wenxuan and Wu, Yutong and Mu, Jianan and Wang, Jinghua and Zhao, Yang and others},
  title   = {CodeV-R1: Reasoning-Enhanced Verilog Generation},
  journal = {arXiv preprint arXiv:2505.24183},
  year    = {2025},
}

@article{wei2025swe,
  author  = {Wei, Yuxiang and Duchenne, Olivier and Copet, Jade and Carbonneaux, Quentin and Zhang, Lingming and Fried, Daniel and Synnaeve, Gabriel and Singh, Rishabh and Wang, Sida I},
  title   = {Swe-rl: Advancing llm reasoning via reinforcement learning on open software evolution},
  journal = {arXiv preprint arXiv:2502.18449},
  year    = {2025},
}

@article{zeng2025satori,
  author  = {Zeng, Guangtao and Shen, Maohao and Chen, Delin and Qi, Zhenting and Das, Subhro and Gutfreund, Dan and Cox, David and Wornell, Gregory and Lu, Wei and Hong, Zhang-Wei and others},
  title   = {Satori-SWE: Evolutionary Test-Time Scaling for Sample-Efficient Software Engineering},
  journal = {arXiv preprint arXiv:2505.23604},
  year    = {2025},
}

@article{fuster2025repaca,
  author  = {Fuster-Pena, Marcos and de-Fitero-Dominguez, David and Garcia-Cabot, Antonio and Garcia-Lopez, Eva},
  title   = {RePaCA: Leveraging Reasoning Large Language Models for Static Automated Patch Correctness Assessment},
  journal = {arXiv preprint arXiv:2507.22580},
  year    = {2025},
}

@article{hu2025repair,
  author  = {Hu, Haichuan and Xie, Xiaochen and Zhang, Quanjun},
  title   = {Repair-R1: Better Test Before Repair},
  journal = {arXiv preprint arXiv:2507.22853},
  year    = {2025},
}

@article{du2025afterburner,
  author  = {Du, Mingzhe and Tuan, Luu Anh and Liu, Yue and Qing, Yuhao and Huang, Dong and He, Xinyi and Liu, Qian and Ma, Zejun and Ng, See-kiong},
  title   = {Afterburner: Reinforcement Learning Facilitates Self-Improving Code Efficiency Optimization},
  journal = {arXiv preprint arXiv:2505.23387},
  year    = {2025},
}

@article{yao2025training,
  author  = {Yao, Feng and Wang, Zilong and Liu, Liyuan and Cui, Junxia and Zhong, Li and Fu, Xiaohan and Mai, Haohui and Krishnan, Vish and Gao, Jianfeng and Shang, Jingbo},
  title   = {Training Language Models to Generate Quality Code with Program Analysis Feedback},
  journal = {arXiv preprint arXiv:2505.22704},
  year    = {2025},
}

@article{huang20253d-r1,
  author  = {Huang, Ting and Zhang, Zeyu and Tang, Hao},
  title   = {3D-R1: Enhancing Reasoning in 3D VLMs for Unified Scene Understanding},
  journal = {arXiv preprint arXiv:2507.23478},
  year    = {2025},
}
@article{bengio2023gflownet,
  author  = {Bengio, Yoshua and Lahlou, Salem and Deleu, Tristan and Hu, Edward J and Tiwari, Mo and Bengio, Emmanuel},
  title   = {Gflownet foundations},
  journal = {Journal of Machine Learning Research},
  year    = {2023},
  volume  = {24},
  number  = {210},
  pages   = {1--55},
}
@article{hu2023amortizing,
  author  = {Hu, Edward J and Jain, Moksh and Elmoznino, Eric and Kaddar, Younesse and Lajoie, Guillaume and Bengio, Yoshua and Malkin, Nikolay},
  title   = {Amortizing intractable inference in large language models},
  journal = {arXiv preprint arXiv:2310.04363},
  year    = {2023},
}
% The entry had no year, which yields an empty date in the rendered
% reference; the Forty-second ICML was held in 2025.
@inproceedings{yuflow,
  author    = {Yu, Fangxu and Jiang, Lai and Kang, Haoqiang and Hao, Shibo and Qin, Lianhui},
  title     = {Flow of Reasoning: Training LLMs for Divergent Reasoning with Minimal Examples},
  booktitle = {Forty-second International Conference on Machine Learning},
  year      = {2025}
}
@article{lee2024learning,
  author  = {Lee, Seanie and Kim, Minsu and Cherif, Lynn and Dobre, David and Lee, Juho and Hwang, Sung Ju and Kawaguchi, Kenji and Gidel, Gauthier and Bengio, Yoshua and Malkin, Nikolay and others},
  title   = {Learning diverse attacks on large language models for robust red-teaming and safety tuning},
  journal = {arXiv preprint arXiv:2405.18540},
  year    = {2024},
}

@article{jiang2025improving,
  author  = {Jiang, Daniel R and Nikulkov, Alex and Chen, Yu-Chia and Bai, Yang and Zhu, Zheqing},
  title   = {Improving Generative Ad Text on Facebook using Reinforcement Learning},
  journal = {arXiv preprint arXiv:2507.21983},
  year    = {2025},
}

@article{zhang2025shop,
  author  = {Zhang, Yimeng and Wang, Tian and Gesi, Jiri and Wang, Ziyi and Lu, Yuxuan and Lin, Jiacheng and Zhan, Sinong and Gao, Vianne and Jiao, Ruochen and Liu, Junze and others},
  title   = {Shop-R1: Rewarding LLMs to Simulate Human Behavior in Online Shopping via Reinforcement Learning},
  journal = {arXiv preprint arXiv:2507.17842},
  year    = {2025},
}

@article{oh2025laviplan,
  author  = {Oh, Hayeon},
  title   = {LaViPlan: Language-Guided Visual Path Planning with RLVR},
  journal = {arXiv preprint arXiv:2507.12911},
  year    = {2025},
}

@article{qiu2025opentable,
  author  = {Qiu, Zipeng},
  title   = {OpenTable-R1: A Reinforcement Learning Augmented Tool Agent for Open-Domain Table Question Answering},
  journal = {arXiv preprint arXiv:2507.03018},
  year    = {2025},
}

@article{han2025joyagents,
  author  = {Han, Ai and Hu, Junxing and Wei, Pu and Zhang, Zhiqian and Guo, Yuhang and Lu, Jiawei and Zhang, Zicheng},
  title   = {JoyAgents-R1: Joint Evolution Dynamics for Versatile Multi-LLM Agents with Reinforcement Learning},
  journal = {arXiv preprint arXiv:2506.19846},
  year    = {2025},
}

@article{li2025drive,
  author  = {Li, Yue and Tian, Meng and Zhu, Dechang and Zhu, Jiangtong and Lin, Zhenyu and Xiong, Zhiwei and Zhao, Xinhai},
  title   = {Drive-R1: Bridging Reasoning and Planning in VLMs for Autonomous Driving with Reinforcement Learning},
  journal = {arXiv preprint arXiv:2506.18234},
  year    = {2025},
}

@article{kang2025viki,
  author  = {Kang, Li and Song, Xiufeng and Zhou, Heng and Qin, Yiran and Yang, Jie and Liu, Xiaohong and Torr, Philip and Bai, Lei and Yin, Zhenfei},
  title   = {Viki-r: Coordinating embodied multi-agent cooperation via reinforcement learning},
  journal = {arXiv preprint arXiv:2506.09049},
  year    = {2025},
}

@article{xia2025mmedagent,
  author  = {Xia, Peng and Wang, Jinglu and Peng, Yibo and Zeng, Kaide and Wu, Xian and Tang, Xiangru and Zhu, Hongtu and Li, Yun and Liu, Shujie and Lu, Yan and others},
  title   = {MMedAgent-RL: Optimizing Multi-Agent Collaboration for Multimodal Medical Reasoning},
  journal = {arXiv preprint arXiv:2506.00555},
  year    = {2025},
}

@article{chen2025spc,
  author  = {Chen, Jiaqi and Zhang, Bang and Ma, Ruotian and Wang, Peisong and Liang, Xiaodan and Tu, Zhaopeng and Li, Xiaolong and Wong, Kwan-Yee K},
  title   = {Spc: Evolving self-play critic via adversarial games for llm reasoning},
  journal = {arXiv preprint arXiv:2504.19162},
  year    = {2025},
}

@article{ye2024scalable,
  author  = {Ye, Ziyu and Agarwal, Rishabh and Liu, Tianqi and Joshi, Rishabh and Velury, Sarmishta and Le, Quoc V and Tan, Qijun and Liu, Yuan},
  title   = {Scalable Reinforcement Post-Training Beyond Static Human Prompts: Evolving Alignment via Asymmetric Self-Play},
  journal = {arXiv preprint arXiv:2411.00062},
  year    = {2024},
}


@article{liu2025chasing,
  author  = {Liu, Mickel and Jiang, Liwei and Liang, Yancheng and Du, Simon Shaolei and Choi, Yejin and Althoff, Tim and Jaques, Natasha},
  title   = {Chasing Moving Targets with Online Self-Play Reinforcement Learning for Safer Language Models},
  journal = {arXiv preprint arXiv:2506.07468},
  year    = {2025},
}

@article{qian2025toolrl,
  author  = {Qian, Cheng and Acikgoz, Emre Can and He, Qi and Wang, Hongru and Chen, Xiusi and Hakkani-T{\"u}r, Dilek and Tur, Gokhan and Ji, Heng},
  title   = {Toolrl: Reward is all tool learning needs},
  journal = {arXiv preprint arXiv:2504.13958},
  year    = {2025},
}

% Corporate author: the extra brace pair keeps the organisation name as one
% indivisible unit, so styles do not split it into a given name and a surname.
@misc{kimi-researcher2025,
  author       = {{Moonshot AI}},
  title        = {Kimi-Researcher: End-to-End RL Training for Emerging Agentic Capabilities},
  year         = {2025},
  howpublished = {\url{https://moonshotai.github.io/Kimi-Researcher/}},
  note         = {Accessed: 2025-08-13}
}

% The author list is truncated with the literal token "and others", which
% styles render as et al.; a hand-written "et al" is parsed as a person.
@misc{phan2025humanitysexam,
  author        = {Long Phan and Alice Gatti and Ziwen Han and Nathaniel Li and Josephina Hu and Hugh Zhang and others},
  title         = {Humanity's Last Exam},
  year          = {2025},
  eprint        = {2501.14249},
  archiveprefix = {arXiv},
  primaryclass  = {cs.LG},
  url           = {https://arxiv.org/abs/2501.14249}
}

@article{dao2025jan,
  author  = {Dao, Alan and Vu, Dinh Bach},
  title   = {Jan-nano Technical Report},
  journal = {arXiv preprint arXiv:2506.22760},
  year    = {2025},
}

% Corporate author: the extra brace pair keeps the team name as one
% indivisible unit, so styles do not split it into given names and a surname.
@misc{2025mirothinker,
  author       = {{MiroMind AI Team}},
  title        = {MiroThinker: An open-source agentic model series trained for deep research and complex, long-horizon problem solving},
  howpublished = {\url{https://github.com/MiroMindAI/MiroThinker}},
  year         = {2025}
}

@article{sha2025sem,
  author  = {Sha, Zeyang and Cui, Shiwen and Wang, Weiqiang},
  title   = {SEM: Reinforcement Learning for Search-Efficient Large Language Models},
  journal = {arXiv preprint arXiv:2505.07903},
  year    = {2025},
}

@article{luo2025agent,
  author  = {Luo, Xufang and Zhang, Yuge and He, Zhiyuan and Wang, Zilong and Zhao, Siyun and Li, Dongsheng and Qiu, Luna K and Yang, Yuqing},
  title   = {Agent Lightning: Train ANY AI Agents with Reinforcement Learning},
  journal = {arXiv preprint arXiv:2508.03680},
  year    = {2025},
}

@article{mei20252,
  author  = {Mei, Jianbiao and Hu, Tao and Fu, Daocheng and Wen, Licheng and Yang, Xuemeng and Wu, Rong and Cai, Pinlong and Cai, Xinyu and Gao, Xing and Yang, Yu and others},
  title   = {O2-Searcher: A Searching-based Agent Model for Open-Domain Open-Ended Question Answering},
  journal = {arXiv preprint arXiv:2505.16582},
  year    = {2025},
}

@article{dao2025rezero,
  author  = {Dao, Alan and Le, Thinh},
  title   = {ReZero: Enhancing LLM search ability by trying one-more-time},
  journal = {arXiv preprint arXiv:2504.11001},
  year    = {2025},
}

@article{jiang2025s3,
  author  = {Jiang, Pengcheng and Xu, Xueqiang and Lin, Jiacheng and Xiao, Jinfeng and Wang, Zifeng and Sun, Jimeng and Han, Jiawei},
  title   = {s3: You Don't Need That Much Data to Train a Search Agent via RL},
  journal = {arXiv preprint arXiv:2505.14146},
  year    = {2025},
}

@article{fan2025ssrl,
  author  = {Fan, Yuchen and Zhang, Kaiyan and Zhou, Heng and Zuo, Yuxin and Chen, Yanxu and Fu, Yu and Long, Xinwei and Zhu, Xuekai and Jiang, Che and Zhang, Yuchen and others},
  title   = {SSRL: Self-Search Reinforcement Learning},
  journal = {arXiv preprint arXiv:2508.10874},
  year    = {2025},
}

@article{nakano2021webgpt,
  author  = {Nakano, Reiichiro and Hilton, Jacob and Balaji, Suchir and Wu, Jeff and Ouyang, Long and Kim, Christina and Hesse, Christopher and Jain, Shantanu and Kosaraju, Vineet and Saunders, William and others},
  title   = {Webgpt: Browser-assisted question-answering with human feedback},
  journal = {arXiv preprint arXiv:2112.09332},
  year    = {2021},
}

@article{wei2025webagent,
  author  = {Wei, Zhepei and Yao, Wenlin and Liu, Yao and Zhang, Weizhi and Lu, Qin and Qiu, Liang and Yu, Changlong and Xu, Puyang and Zhang, Chao and Yin, Bing and others},
  title   = {Webagent-r1: Training web agents via end-to-end multi-turn reinforcement learning},
  journal = {arXiv preprint arXiv:2505.16421},
  year    = {2025},
}

% Latent Reasoning
@article{wei2022chain,
  author  = {Wei, Jason and Wang, Xuezhi and Schuurmans, Dale and Bosma, Maarten and Xia, Fei and Chi, Ed and Le, Quoc V and Zhou, Denny and others},
  title   = {Chain-of-thought prompting elicits reasoning in large language models},
  journal = {Advances in neural information processing systems},
  year    = {2022},
  volume  = {35},
  pages   = {24824--24837},
}

@article{yao2023tree,
  author  = {Yao, Shunyu and Yu, Dian and Zhao, Jeffrey and Shafran, Izhak and Griffiths, Tom and Cao, Yuan and Narasimhan, Karthik},
  title   = {Tree of thoughts: Deliberate problem solving with large language models},
  journal = {Advances in neural information processing systems},
  year    = {2023},
  volume  = {36},
  pages   = {11809--11822},
}

@inproceedings{yao2023react,
  author    = {Yao, Shunyu and Zhao, Jeffrey and Yu, Dian and Du, Nan and Shafran, Izhak and Narasimhan, Karthik and Cao, Yuan},
  title     = {React: Synergizing reasoning and acting in language models},
  booktitle = {International Conference on Learning Representations (ICLR)},
  year      = {2023},
}

@article{hua2024intuitive,
  author  = {Hua, Ermo and Qi, Biqing and Zhang, Kaiyan and Yu, Yue and Ding, Ning and Lv, Xingtai and Tian, Kai and Zhou, Bowen},
  title   = {Intuitive fine-tuning: Towards simplifying alignment into a single process},
  journal = {arXiv preprint arXiv:2405.11870},
  year    = {2024},
}

@article{geiping2025scaling,
  author  = {Geiping, Jonas and McLeish, Sean and Jain, Neel and Kirchenbauer, John and Singh, Siddharth and Bartoldson, Brian R and Kailkhura, Bhavya and Bhatele, Abhinav and Goldstein, Tom},
  title   = {Scaling up test-time compute with latent reasoning: A recurrent depth approach},
  journal = {arXiv preprint arXiv:2502.05171},
  year    = {2025},
}

@article{hao2024training,
  author  = {Hao, Shibo and Sukhbaatar, Sainbayar and Su, DiJia and Li, Xian and Hu, Zhiting and Weston, Jason and Tian, Yuandong},
  title   = {Training large language models to reason in a continuous latent space},
  journal = {arXiv preprint arXiv:2412.06769},
  year    = {2024},
}

@article{ouyang2022training,
  author  = {Ouyang, Long and Wu, Jeffrey and Jiang, Xu and Almeida, Diogo and Wainwright, Carroll and Mishkin, Pamela and Zhang, Chong and Agarwal, Sandhini and Slama, Katarina and Ray, Alex and others},
  title   = {Training language models to follow instructions with human feedback},
  journal = {Advances in neural information processing systems},
  year    = {2022},
  volume  = {35},
  pages   = {27730--27744},
}

@article{mitra2024orca,
  author  = {Mitra, Arindam and Khanpour, Hamed and Rosset, Corby and Awadallah, Ahmed},
  title   = {Orca-math: Unlocking the potential of slms in grade school math},
  journal = {arXiv preprint arXiv:2402.14830},
  year    = {2024},
}


@article{arriola2025block,
  author  = {Arriola, Marianne and Gokaslan, Aaron and Chiu, Justin T and Yang, Zhihan and Qi, Zhixuan and Han, Jiaqi and Sahoo, Subham Sekhar and Kuleshov, Volodymyr},
  title   = {Block diffusion: Interpolating between autoregressive and diffusion language models},
  journal = {arXiv preprint arXiv:2503.09573},
  year    = {2025},
}

# xxx

@article{geng2025webwatcher,
  author  = {Geng, Xinyu and Xia, Peng and Zhang, Zhen and Wang, Xinyu and Wang, Qiuchen and Ding, Ruixue and Wang, Chenxi and Wu, Jialong and Zhao, Yida and Li, Kuan and others},
  title   = {WebWatcher: Breaking New Frontiers of Vision-Language Deep Research Agent},
  journal = {arXiv preprint arXiv:2508.05748},
  year    = {2025},
}

@article{feng2025retool,
  author  = {Feng, Jiazhan and Huang, Shijue and Qu, Xingwei and Zhang, Ge and Qin, Yujia and Zhong, Baoquan and Jiang, Chengquan and Chi, Jinxin and Zhong, Wanjun},
  title   = {Retool: Reinforcement learning for strategic tool use in llms},
  journal = {arXiv preprint arXiv:2504.11536},
  year    = {2025},
}

@article{brown2020language,
  author  = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and others},
  title   = {Language models are few-shot learners},
  journal = {Advances in neural information processing systems},
  year    = {2020},
  volume  = {33},
  pages   = {1877--1901},
}

@article{kaplan2020scaling,
  author  = {Kaplan, Jared and McCandlish, Sam and Henighan, Tom and Brown, Tom B and Chess, Benjamin and Child, Rewon and Gray, Scott and Radford, Alec and Wu, Jeffrey and Amodei, Dario},
  title   = {Scaling laws for neural language models},
  journal = {arXiv preprint arXiv:2001.08361},
  year    = {2020},
}

@article{achiam2023gpt,
  author  = {Achiam, Josh and Adler, Steven and Agarwal, Sandhini and Ahmad, Lama and Akkaya, Ilge and Aleman, Florencia Leoni and Almeida, Diogo and Altenschmidt, Janko and Altman, Sam and Anadkat, Shyamal and others},
  title   = {Gpt-4 technical report},
  journal = {arXiv preprint arXiv:2303.08774},
  year    = {2023},
}

@article{liu2024deepseek,
  author  = {Liu, Aixin and Feng, Bei and Xue, Bing and Wang, Bingxuan and Wu, Bochao and Lu, Chengda and Zhao, Chenggang and Deng, Chengqi and Zhang, Chenyu and Ruan, Chong and others},
  title   = {Deepseek-v3 technical report},
  journal = {arXiv preprint arXiv:2412.19437},
  year    = {2024},
}

@article{wei2022emergent,
  author  = {Wei, Jason and Tay, Yi and Bommasani, Rishi and Raffel, Colin and Zoph, Barret and Borgeaud, Sebastian and Yogatama, Dani and Bosma, Maarten and Zhou, Denny and Metzler, Donald and others},
  title   = {Emergent abilities of large language models},
  journal = {arXiv preprint arXiv:2206.07682},
  year    = {2022},
}

% Technical report rather than a journal article: an article entry without a
% journal field triggers a missing-field warning. The issuing organisation
% goes in institution and the city in address; the author list is the
% complete four-author list, so no trailing "and others".
@techreport{radford2018improving,
  author      = {Radford, Alec and Narasimhan, Karthik and Salimans, Tim and Sutskever, Ilya},
  title       = {Improving language understanding by generative pre-training},
  institution = {OpenAI},
  address     = {San Francisco, CA, USA},
  year        = {2018}
}

% Two-author book, so no trailing "and others"; the publisher name and its
% city are stored in separate fields instead of one fused string.
% NOTE(review): the volume value comes from the original export and looks
% like aggregator residue; confirm before relying on it.
@book{sutton1998introduction,
  author    = {Sutton, Richard S and Barto, Andrew G},
  title     = {Introduction to reinforcement learning},
  volume    = {135},
  year      = {1998},
  publisher = {MIT Press},
  address   = {Cambridge, MA}
}

@inproceedings{rein2024gpqa,
  author    = {Rein, David and Hou, Betty Li and Stickland, Asa Cooper and Petty, Jackson and Pang, Richard Yuanzhe and Dirani, Julien and Michael, Julian and Bowman, Samuel R},
  title     = {Gpqa: A graduate-level google-proof q\&a benchmark},
  booktitle = {First Conference on Language Modeling},
  year      = {2024},
}

@article{jimenez2023swe,
  author  = {Jimenez, Carlos E and Yang, John and Wettig, Alexander and Yao, Shunyu and Pei, Kexin and Press, Ofir and Narasimhan, Karthik},
  title   = {Swe-bench: Can language models resolve real-world github issues?},
  journal = {arXiv preprint arXiv:2310.06770},
  year    = {2023},
}

@article{jaech2024openai,
  author  = {Jaech, Aaron and Kalai, Adam and Lerer, Adam and Richardson, Adam and El-Kishky, Ahmed and Low, Aiden and Helyar, Alec and Madry, Aleksander and Beutel, Alex and Carney, Alex and others},
  title   = {Openai o1 system card},
  journal = {arXiv preprint arXiv:2412.16720},
  year    = {2024},
}

@article{openai-o3,
  author  = {OpenAI},
  title   = {OpenAI o3 and o4-mini System Card},
  journal = {Blog},
  year    = {2025},
}

@article{openai-gpt5,
  author  = {OpenAI},
  title   = {GPT-5 System Card},
  journal = {Blog},
  year    = {2025},
}

@inproceedings{aghajanyan2023scaling,
  author       = {Aghajanyan, Armen and Yu, Lili and Conneau, Alexis and Hsu, Wei-Ning and Hambardzumyan, Karen and Zhang, Susan and Roller, Stephen and Goyal, Naman and Levy, Omer and Zettlemoyer, Luke},
  title        = {Scaling laws for generative mixed-modal language models},
  booktitle    = {International Conference on Machine Learning},
  year         = {2023},
  pages        = {265--279},
  organization = {PMLR},
}

@article{hendrycks2020measuring,
  author  = {Hendrycks, Dan and Burns, Collin and Basart, Steven and Zou, Andy and Mazeika, Mantas and Song, Dawn and Steinhardt, Jacob},
  title   = {Measuring massive multitask language understanding},
  journal = {arXiv preprint arXiv:2009.03300},
  year    = {2020},
}

@article{chen2025towards,
  author  = {Chen, Qiguang and Qin, Libo and Liu, Jinhao and Peng, Dengyun and Guan, Jiannan and Wang, Peng and Hu, Mengkang and Zhou, Yuhang and Gao, Te and Che, Wanxiang},
  title   = {Towards reasoning era: A survey of long chain-of-thought for reasoning large language models},
  journal = {arXiv preprint arXiv:2503.09567},
  year    = {2025},
}


@article{feng2025video-r1,
  author  = {Feng, Kaituo and Gong, Kaixiong and Li, Bohao and Guo, Zonghao and Wang, Yibing and Peng, Tianshuo and Wu, Junfei and Zhang, Xiaoying and Wang, Benyou and Yue, Xiangyu},
  title   = {Video-r1: Reinforcing video reasoning in mllms},
  journal = {arXiv preprint arXiv:2503.21776},
  year    = {2025},
}

@article{dang2025focused-thinking,
  author  = {Dang, Jisheng and Wu, Jingze and Wang, Teng and Lin, Xuanhui and Zhu, Nannan and Chen, Hongbo and Zheng, Wei-Shi and Wang, Meng and Chua, Tat-Seng},
  title   = {Reinforcing video reasoning with focused thinking},
  journal = {arXiv preprint arXiv:2505.24718},
  year    = {2025},
}

@article{zhang2025vq-insight,
  author  = {Zhang, Xuanyu and Li, Weiqi and Zhao, Shijie and Li, Junlin and Zhang, Li and Zhang, Jian},
  title   = {VQ-Insight: Teaching VLMs for AI-Generated Video Quality Understanding via Progressive Visual Reinforcement Learning},
  journal = {arXiv preprint arXiv:2506.18564},
  year    = {2025},
}

@article{chen2025long-rl-video,
  author  = {Chen, Yukang and Huang, Wei and Shi, Baifeng and Hu, Qinghao and Ye, Hanrong and Zhu, Ligeng and Liu, Zhijian and Molchanov, Pavlo and Kautz, Jan and Qi, Xiaojuan and others},
  title   = {Scaling rl to long videos},
  journal = {arXiv preprint arXiv:2507.07966},
  year    = {2025},
}


@article{wang2025videorft,
  author  = {Wang, Qi and Yu, Yanrui and Yuan, Ye and Mao, Rui and Zhou, Tianfei},
  title   = {VideoRFT: Incentivizing Video Reasoning Capability in MLLMs via Reinforced Fine-Tuning},
  journal = {arXiv preprint arXiv:2505.12434},
  year    = {2025},
}


@article{li2025videochat-r1,
  author  = {Li, Xinhao and Yan, Ziang and Meng, Desen and Dong, Lu and Zeng, Xiangyu and He, Yinan and Wang, Yali and Qiao, Yu and Wang, Yi and Wang, Limin},
  title   = {Videochat-r1: Enhancing spatio-temporal perception via reinforcement fine-tuning},
  journal = {arXiv preprint arXiv:2504.06958},
  year    = {2025},
}


@article{lillicrap2015continuous,
  author  = {Lillicrap, Timothy P and Hunt, Jonathan J and Pritzel, Alexander and Heess, Nicolas and Erez, Tom and Tassa, Yuval and Silver, David and Wierstra, Daan},
  title   = {Continuous control with deep reinforcement learning},
  journal = {arXiv preprint arXiv:1509.02971},
  year    = {2015},
}

@inproceedings{anderson2018vision,
  author    = {Anderson, Peter and Wu, Qi and Teney, Damien and Bruce, Jake and Johnson, Mark and S{\"u}nderhauf, Niko and Reid, Ian and Gould, Stephen and Van Den Hengel, Anton},
  title     = {Vision-and-language navigation: Interpreting visually-grounded navigation instructions in real environments},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  year      = {2018},
  pages     = {3674--3683},
}

@inproceedings{wang2018look,
  author    = {Wang, Xin and Xiong, Wenhan and Wang, Hongmin and Wang, William Yang},
  title     = {Look before you leap: Bridging model-free and model-based reinforcement learning for planned-ahead vision-and-language navigation},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)},
  year      = {2018},
  pages     = {37--53},
}

@inproceedings{wang2019reinforced,
  author    = {Wang, Xin and Huang, Qiuyuan and Celikyilmaz, Asli and Gao, Jianfeng and Shen, Dinghan and Wang, Yuan-Fang and Wang, William Yang and Zhang, Lei},
  title     = {Reinforced cross-modal matching and self-supervised imitation learning for vision-language navigation},
  booktitle = {Proceedings of the IEEE/CVF conference on computer vision and pattern recognition},
  year      = {2019},
  pages     = {6629--6638},
}

@inproceedings{ji2025robobrain,
  author    = {Ji, Yuheng and Tan, Huajie and Shi, Jiayu and Hao, Xiaoshuai and Zhang, Yuan and Zhang, Hengyuan and Wang, Pengwei and Zhao, Mengdi and Mu, Yao and An, Pengju and others},
  title     = {Robobrain: A unified brain model for robotic manipulation from abstract to concrete},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference},
  year      = {2025},
  pages     = {1724--1734},
}

@inproceedings{zitkovich2023rt,
  author       = {Zitkovich, Brianna and Yu, Tianhe and Xu, Sichun and Xu, Peng and Xiao, Ted and Xia, Fei and Wu, Jialin and Wohlhart, Paul and Welker, Stefan and Wahid, Ayzaan and others},
  title        = {Rt-2: Vision-language-action models transfer web knowledge to robotic control},
  booktitle    = {Conference on Robot Learning},
  year         = {2023},
  pages        = {2165--2183},
  organization = {PMLR},
}

@article{li2025simplevla,
  author  = {Li, Haozhan and Zuo, Yuxin and Yu, Jiale and Zhang, Yuhao and Yang, Zhaohui and Zhang, Kaiyan and Zhu, Xuekai and Zhang, Yuchen and Chen, Tianxing and Cui, Ganqu and others},
  title   = {SimpleVLA-RL: Scaling VLA Training via Reinforcement Learning},
  journal = {arXiv preprint arXiv:2509.09674},
  year    = {2025},
}

@article{lu2025vla,
  author  = {Lu, Guanxing and Guo, Wenkai and Zhang, Chubin and Zhou, Yuheng and Jiang, Haonan and Gao, Zifeng and Tang, Yansong and Wang, Ziwei},
  title   = {Vla-rl: Towards masterful and general robotic manipulation with scalable reinforcement learning},
  journal = {arXiv preprint arXiv:2505.18719},
  year    = {2025},
}

@article{liu2025can,
  author  = {Liu, Jijia and Gao, Feng and Wei, Bingwen and Chen, Xinlei and Liao, Qingmin and Wu, Yi and Yu, Chao and Wang, Yu},
  title   = {What can rl bring to vla generalization? an empirical study},
  journal = {arXiv preprint arXiv:2505.19789},
  year    = {2025},
}

@article{tan2025interactive,
  author  = {Tan, Shuhan and Dou, Kairan and Zhao, Yue and Kr{\"a}henb{\"u}hl, Philipp},
  title   = {Interactive Post-Training for Vision-Language-Action Models},
  journal = {arXiv preprint arXiv:2505.17016},
  year    = {2025},
}

@article{chen2025conrft,
  author  = {Chen, Yuhui and Tian, Shuai and Liu, Shugao and Zhou, Yingting and Li, Haoran and Zhao, Dongbin},
  title   = {Conrft: A reinforced fine-tuning method for vla models via consistency policy},
  journal = {arXiv preprint arXiv:2502.05450},
  year    = {2025},
}

@article{kim2025fine,
  author  = {Kim, Moo Jin and Finn, Chelsea and Liang, Percy},
  title   = {Fine-tuning vision-language-action models: Optimizing speed and success},
  journal = {arXiv preprint arXiv:2502.19645},
  year    = {2025},
}

% The model name is the Greek letter pi with subscript zero; it is written as
% braced inline math so it typesets as a symbol and survives style recasing.
@article{black2024pi_0,
  author  = {Black, Kevin and Brown, Noah and Driess, Danny and Esmail, Adnan and Equi, Michael and Finn, Chelsea and Fusai, Niccolo and Groom, Lachy and Hausman, Karol and Ichter, Brian and others},
  title   = {{$\pi_0$}: A Vision-Language-Action Flow Model for General Robot Control},
  journal = {arXiv preprint arXiv:2410.24164},
  year    = {2024}
}

@article{busoniu2008comprehensive,
  author    = {Busoniu, Lucian and Babuska, Robert and De Schutter, Bart},
  title     = {A comprehensive survey of multiagent reinforcement learning},
  journal   = {IEEE Transactions on Systems, Man, and Cybernetics, Part C (Applications and Reviews)},
  volume    = {38},
  number    = {2},
  pages     = {156--172},
  publisher = {IEEE},
  year      = {2008},
}

@article{wu2025wechat,
  title   = {{WeChat-YATT}: A Simple, Scalable and Balanced {RLHF} Trainer},
  author  = {Wu, Junyu and Chang, Weiming and Liu, Xiaotao and He, Guanyou and Xian, Tingfeng and Hong, Haoqiang and Chen, Boqi and Tian, Haotao and Yang, Tao and Shi, Yunsheng and others},
  journal = {arXiv preprint arXiv:2508.07970},
  year    = {2025}
}

@article{dorri2018multi,
  title     = {Multi-agent systems: A survey},
  author    = {Dorri, Ali and Kanhere, Salil S and Jurdak, Raja},
  journal   = {IEEE Access},
  volume    = {6},
  pages     = {28573--28593},
  year      = {2018},
  publisher = {IEEE}
}

@article{canese2021multi,
  author    = {Canese, Lorenzo and Cardarilli, Gian Carlo and Di Nunzio, Luca and Fazzolari, Rocco and Giardino, Daniele and Re, Marco and Span{\`o}, Sergio},
  title     = {Multi-agent reinforcement learning: A review of challenges and applications},
  journal   = {Applied Sciences},
  volume    = {11},
  number    = {11},
  pages     = {4948},
  publisher = {MDPI},
  year      = {2021},
}

@article{lowe2017multi,
  title   = {Multi-agent actor-critic for mixed cooperative-competitive environments},
  author  = {Lowe, Ryan and Wu, Yi and Tamar, Aviv and Harb, Jean and Abbeel, Pieter and Mordatch, Igor},
  journal = {Advances in neural information processing systems},
  volume  = {30},
  year    = {2017}
}

@article{rashid2020monotonic,
  title   = {Monotonic value function factorisation for deep multi-agent reinforcement learning},
  author  = {Rashid, Tabish and Samvelyan, Mikayel and {Schroeder de Witt}, Christian and Farquhar, Gregory and Foerster, Jakob and Whiteson, Shimon},
  journal = {Journal of Machine Learning Research},
  volume  = {21},
  number  = {178},
  pages   = {1--51},
  year    = {2020}
}

@article{kwiatkowski2019natural,
  title     = {{Natural Questions}: A benchmark for question answering research},
  author    = {Kwiatkowski, Tom and Palomaki, Jennimaria and Redfield, Olivia and Collins, Michael and Parikh, Ankur and Alberti, Chris and Epstein, Danielle and Polosukhin, Illia and Devlin, Jacob and Lee, Kenton and others},
  journal   = {Transactions of the Association for Computational Linguistics},
  volume    = {7},
  pages     = {453--466},
  year      = {2019},
  publisher = {MIT Press}
}

@article{yang2018hotpotqa,
  title   = {{HotpotQA}: A dataset for diverse, explainable multi-hop question answering},
  author  = {Yang, Zhilin and Qi, Peng and Zhang, Saizheng and Bengio, Yoshua and Cohen, William W and Salakhutdinov, Ruslan and Manning, Christopher D},
  journal = {arXiv preprint arXiv:1809.09600},
  year    = {2018}
}

@article{trivedi2022musique,
  title     = {{MuSiQue}: Multihop Questions via Single-hop Question Composition},
  author    = {Trivedi, Harsh and Balasubramanian, Niranjan and Khot, Tushar and Sabharwal, Ashish},
  journal   = {Transactions of the Association for Computational Linguistics},
  volume    = {10},
  pages     = {539--554},
  year      = {2022},
  publisher = {MIT Press}
}

@inproceedings{foerster2018counterfactual,
  author    = {Foerster, Jakob and Farquhar, Gregory and Afouras, Triantafyllos and Nardelli, Nantas and Whiteson, Shimon},
  title     = {Counterfactual multi-agent policy gradients},
  booktitle = {Proceedings of the AAAI conference on artificial intelligence},
  volume    = {32},
  year      = {2018},
}

@article{zhang2023proagent,
  title   = {{ProAgent}: Building proactive cooperative {AI} with large language models},
  author  = {Zhang, Ceyao and Yang, Kaijie and Hu, Siyi and Wang, Zihao and Li, Guanghe and Sun, Yihang and Zhang, Cheng and Zhang, Zhaowei and Liu, Anji and Zhu, Song-Chun and others},
  journal = {CoRR},
  year    = {2023}
}

@inproceedings{ding2023entity,
  title     = {Entity divider with language grounding in multi-agent reinforcement learning},
  author    = {Ding, Ziluo and Zhang, Wanpeng and Yue, Junpeng and Wang, Xiangjun and Huang, Tiejun and Lu, Zongqing},
  booktitle = {International Conference on Machine Learning},
  pages     = {8103--8119},
  year      = {2023},
  publisher = {PMLR}
}

@article{li2023theory,
  author  = {Li, Huao and Chong, Yu Quan and Stepputtis, Simon and Campbell, Joseph and Hughes, Dana and Lewis, Michael and Sycara, Katia},
  title   = {Theory of mind for multi-agent collaboration via large language models},
  journal = {arXiv preprint arXiv:2310.10701},
  year    = {2023},
}

@article{zhang2023controlling,
  author  = {Zhang, Bin and Mao, Hangyu and Ruan, Jingqing and Wen, Ying and Li, Yang and Zhang, Shao and Xu, Zhiwei and Li, Dapeng and Li, Ziyue and Zhao, Rui and others},
  title   = {Controlling large language model-based agents for large-scale decision-making: An actor-critic approach},
  journal = {arXiv preprint arXiv:2311.13884},
  year    = {2023},
}

@article{xie2025teaching,
  author  = {Xie, Zhihui and Chen, Liyu and Mao, Weichao and Xu, Jingjing and Kong, Lingpeng and others},
  title   = {Teaching language models to critique via reinforcement learning},
  journal = {arXiv preprint arXiv:2502.03492},
  year    = {2025},
}

@article{liu2025llm,
  title   = {{LLM} Collaboration With Multi-Agent Reinforcement Learning},
  author  = {Liu, Shuo and Liang, Zeyu and Lyu, Xueguang and Amato, Christopher},
  journal = {arXiv preprint arXiv:2508.04652},
  year    = {2025}
}


@inproceedings{lin2016fixed,
  title     = {Fixed point quantization of deep convolutional networks},
  author    = {Lin, Darryl and Talathi, Sachin and Annapureddy, Sreekanth},
  booktitle = {International Conference on Machine Learning},
  pages     = {2849--2858},
  year      = {2016},
  publisher = {PMLR}
}

@article{wu2025generalization,
  title   = {On the Generalization of {SFT}: A Reinforcement Learning Perspective with Reward Rectification},
  author  = {Wu, Yongliang and Zhou, Yizhou and Ziheng, Zhou and Peng, Yingzhe and Ye, Xinyu and Hu, Xinting and Zhu, Wenbo and Qi, Lu and Yang, Ming-Hsuan and Yang, Xu},
  journal = {arXiv preprint arXiv:2508.05629},
  year    = {2025}
}

@article{zeng2025glm,
  title   = {{GLM-4.5}: Agentic, Reasoning, and Coding ({ARC}) Foundation Models},
  author  = {Zeng, Aohan and Lv, Xin and Zheng, Qinkai and Hou, Zhenyu and Chen, Bin and Xie, Chengxing and Wang, Cunxiang and Yin, Da and Zeng, Hao and Zhang, Jiajie and others},
  journal = {arXiv preprint arXiv:2508.06471},
  year    = {2025}
}

@article{song2025r1++,
  title   = {{R1-Searcher++}: Incentivizing the Dynamic Knowledge Acquisition of {LLMs} via Reinforcement Learning},
  author  = {Song, Huatong and Jiang, Jinhao and Tian, Wenqing and Chen, Zhipeng and Wu, Yuhuan and Zhao, Jiahao and Min, Yingqian and Zhao, Wayne Xin and Fang, Lei and Wen, Ji-Rong},
  journal = {arXiv preprint arXiv:2505.17005},
  year    = {2025}
}

@article{gao2025beyond,
  title   = {Beyond Ten Turns: Unlocking Long-Horizon Agentic Search with Large-Scale Asynchronous {RL}},
  author  = {Gao, Jiaxuan and Fu, Wei and Xie, Minyang and Xu, Shusheng and He, Chuyi and Mei, Zhiyu and Zhu, Banghua and Wu, Yi},
  journal = {arXiv preprint arXiv:2508.07976},
  year    = {2025}
}

@article{chen2025pass,
  author  = {Chen, Zhipeng and Qin, Xiaobo and Wu, Youbin and Ling, Yue and Ye, Qinghao and Zhao, Wayne Xin and Shi, Guang},
  title   = {Pass@k Training for Adaptively Balancing Exploration and Exploitation of Large Reasoning Models},
  journal = {arXiv preprint arXiv:2508.10751},
  year    = {2025},
}

@article{walder2025pass,
  title   = {{Pass@K} Policy Optimization: Solving Harder Reinforcement Learning Problems},
  author  = {Walder, Christian and Karkhanis, Deep},
  journal = {arXiv preprint arXiv:2505.15201},
  year    = {2025}
}

@article{liu2025learn,
  author  = {Liu, Wei and Zhou, Ruochen and Deng, Yiyun and Huang, Yuzhen and Liu, Junteng and Deng, Yuntian and Zhang, Yizhe and He, Junxian},
  title   = {Learn to reason efficiently with adaptive length-based reward shaping},
  journal = {arXiv preprint arXiv:2505.15612},
  year    = {2025},
}


@article{naik2024reward,
  author  = {Naik, Abhishek and Wan, Yi and Tomar, Manan and Sutton, Richard S},
  title   = {Reward centering},
  journal = {arXiv preprint arXiv:2405.09999},
  year    = {2024},
}

@article{gupta2022unpacking,
  author  = {Gupta, Abhishek and Pacchiano, Aldo and Zhai, Yuexiang and Kakade, Sham and Levine, Sergey},
  title   = {Unpacking reward shaping: Understanding the benefits of reward engineering on sample complexity},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {35},
  pages   = {15281--15295},
  year    = {2022},
}

@article{goyal2019using,
  author  = {Goyal, Prasoon and Niekum, Scott and Mooney, Raymond J},
  title   = {Using natural language for reward shaping in reinforcement learning},
  journal = {arXiv preprint arXiv:1903.02020},
  year    = {2019},
}

@article{xie2023text2reward,
  title   = {{Text2Reward}: Reward shaping with language models for reinforcement learning},
  author  = {Xie, Tianbao and Zhao, Siheng and Wu, Chen Henry and Liu, Yitao and Luo, Qian and Zhong, Victor and Yang, Yanchao and Yu, Tao},
  journal = {arXiv preprint arXiv:2309.11489},
  year    = {2023}
}

@article{hu2020learning,
  author  = {Hu, Yujing and Wang, Weixun and Jia, Hangtian and Wang, Yixiang and Chen, Yingfeng and Hao, Jianye and Wu, Feng and Fan, Changjie},
  title   = {Learning to utilize shaping rewards: A new approach of reward shaping},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {33},
  pages   = {15931--15941},
  year    = {2020},
}

@article{yang2024qwen2,
  title   = {{Qwen2.5-Math} technical report: Toward mathematical expert model via self-improvement},
  author  = {Yang, An and Zhang, Beichen and Hui, Binyuan and Gao, Bofei and Yu, Bowen and Li, Chengpeng and Liu, Dayiheng and Tu, Jianhong and Zhou, Jingren and Lin, Junyang and others},
  journal = {arXiv preprint arXiv:2409.12122},
  year    = {2024}
}

@article{yue2025does,
  title   = {Does reinforcement learning really incentivize reasoning capacity in {LLMs} beyond the base model?},
  author  = {Yue, Yang and Chen, Zhiqi and Lu, Rui and Zhao, Andrew and Wang, Zhaokai and Song, Shiji and Huang, Gao},
  journal = {arXiv preprint arXiv:2504.13837},
  year    = {2025}
}

@article{setlur2025e3,
  title   = {{e3}: Learning to Explore Enables Extrapolation of Test-Time Compute for {LLMs}},
  author  = {Setlur, Amrith and Yang, Matthew Y. R. and Snell, Charlie and Greer, Jeremy and Wu, Ian and Smith, Virginia and Simchowitz, Max and Kumar, Aviral},
  journal = {arXiv preprint arXiv:2506.09026},
  year    = {2025}
}

@article{li2024prioritized,
  title     = {Prioritized experience replay based on dynamics priority},
  author    = {Li, Hu and Qian, Xuezhong and Song, Wei},
  journal   = {Scientific Reports},
  volume    = {14},
  number    = {1},
  pages     = {6014},
  year      = {2024},
  publisher = {Nature Publishing Group}
}

@article{fujita2025experience,
  author  = {Fujita, Yasuhiro},
  title   = {Experience Replay with Random Reshuffling},
  journal = {arXiv preprint arXiv:2503.02269},
  year    = {2025},
}

@inproceedings{kwon2023efficient,
  title     = {Efficient Memory Management for Large Language Model Serving with {PagedAttention}},
  author    = {Kwon, Woosuk and Li, Zhuohan and Zhuang, Siyuan and Sheng, Ying and Zheng, Lianmin and Yu, Cody Hao and Gonzalez, Joseph E. and Zhang, Hao and Stoica, Ion},
  booktitle = {Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles},
  year      = {2023}
}

@article{megatron-lm,
  title   = {{Megatron-LM}: Training Multi-Billion Parameter Language Models Using Model Parallelism},
  author  = {Shoeybi, Mohammad and Patwary, Mostofa and Puri, Raul and LeGresley, Patrick and Casper, Jared and Catanzaro, Bryan},
  journal = {arXiv preprint arXiv:1909.08053},
  year    = {2019}
}

@inproceedings{rasley2020deepspeed,
  title     = {{DeepSpeed}: System optimizations enable training deep learning models with over 100 billion parameters},
  author    = {Rasley, Jeff and Rajbhandari, Samyam and Ruwase, Olatunji and He, Yuxiong},
  booktitle = {Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery \& Data Mining},
  pages     = {3505--3506},
  year      = {2020}
}

@article{zhao2023pytorch,
  title   = {{PyTorch} {FSDP}: Experiences on scaling fully sharded data parallel},
  author  = {Zhao, Yanli and Gu, Andrew and Varma, Rohan and Luo, Liang and Huang, Chien-Chin and Xu, Min and Wright, Less and Shojanazeri, Hamid and Ott, Myle and Shleifer, Sam and others},
  journal = {arXiv preprint arXiv:2304.11277},
  year    = {2023}
}

@article{yang2025qwen3,
  title   = {{Qwen3} Technical Report},
  author  = {Yang, An and Li, Anfeng and Yang, Baosong and Zhang, Beichen and Hui, Binyuan and Zheng, Bo and Yu, Bowen and Gao, Chang and Huang, Chengen and Lv, Chenxu and others},
  journal = {arXiv preprint arXiv:2505.09388},
  year    = {2025}
}

@article{bercovich2025llama-nemotron,
  title   = {{Llama-Nemotron}: Efficient Reasoning Models},
  author  = {Bercovich, Akhiad and Levy, Itay and Golan, Izik and Dabbah, Mohammad and El-Yaniv, Ran and Puny, Omri and Galil, Ido and Moshe, Zach and Ronen, Tomer and Nabwani, Najeeb and others},
  journal = {arXiv preprint arXiv:2505.00949},
  year    = {2025}
}

@article{mistralai2025magistral,
  title   = {Magistral},
  author  = {Rastogi, Abhinav and Jiang, Albert Q. and Lo, Andy and Berrada, Gabrielle and Lample, Guillaume and Rute, Jason and Barmentlo, Joep and Yadav, Karmesh and Khandelwal, Kartik and Chandu, Khyathi Raghavi and others},
  journal = {arXiv preprint arXiv:2506.10910},
  year    = {2025}
}

@article{chen2024not,
  title   = {Do {NOT} think that much for 2+3=? {On} the overthinking of o1-like {LLMs}},
  author  = {Chen, Xingyu and Xu, Jiahao and Liang, Tian and He, Zhiwei and Pang, Jianhui and Yu, Dian and Song, Linfeng and Liu, Qiuzhi and Zhou, Mengfei and Zhang, Zhuosheng and others},
  journal = {arXiv preprint arXiv:2412.21187},
  year    = {2024}
}

@article{openai2025gpt-oss-120b,
  title   = {gpt-oss-120b \& gpt-oss-20b Model Card},
  author  = {Agarwal, Sandhini and Ahmad, Lama and Ai, Jason and Altman, Sam and Applebaum, Andy and Arbus, Edwin and Arora, Rahul K. and Bai, Yu and Baker, Bowen and Bao, Haiming and others},
  journal = {arXiv preprint arXiv:2508.10925},
  year    = {2025}
}

@article{comanici2025gemini25,
  title   = {{Gemini 2.5}: Pushing the Frontier with Advanced Reasoning, Multimodality, Long Context, and Next Generation Agentic Capabilities},
  author  = {Comanici, Gheorghe and Bieber, Eric and Schaekermann, Mike and Pasupat, Ice and Sachdeva, Noveen and Dhillon, Inderjit and Blistein, Marcel and Ram, Ori and Zhang, Dan and Rosen, Evan and others},
  journal = {arXiv preprint arXiv:2507.06261},
  year    = {2025}
}

@misc{anthropic2025claude,
  title  = {{Claude 3.7 Sonnet} and {Claude Code}},
  url    = {https://www.anthropic.com/news/claude-3-7-sonnet},
  author = {{Anthropic}},
  year   = {2025}
}

@misc{zheng2025easyr1,
  title        = {{EasyR1}: An Efficient, Scalable, Multi-Modality {RL} Training Framework},
  author       = {Zheng, Yaowei and Lu, Junting and Wang, Shenzhi and Feng, Zhangchi and Kuang, Dongdong and Xiong, Yuwen},
  howpublished = {\url{https://github.com/hiyouga/EasyR1}},
  year         = {2025}
}

@misc{marti2025,
  title       = {{MARTI}: A Framework for Multi-Agent {LLM} Systems Reinforced Training and Inference},
  author      = {Zhang, Kaiyan and Liu, Runze and Zhu, Xuekai and Tian, Kai and Zeng, Sihang and Jia, Guoli and Fan, Yuchen and Lv, Xingtai and Zuo, Yuxin and Jiang, Che and Liu, Ziyang and Wang, Jianyu and Wang, Yuru and Zhao, Ruotong and Hua, Ermo and Wang, Yibo and Wang, Shijie and Gao, Junqi and Long, Xinwei and Sun, Youbang and Ma, Zhiyuan and Cui, Ganqu and Bai, Lei and Ding, Ning and Qi, Biqing and Zhou, Bowen},
  year        = {2025},
  institution = {Tsinghua University and Shanghai AI Lab},
  url         = {https://github.com/TsinghuaC3I/MARTI}
}

@article{luo2024survey,
  author    = {Luo, Fan-Ming and Xu, Tian and Lai, Hang and Chen, Xiong-Hui and Zhang, Weinan and Yu, Yang},
  title     = {A survey on model-based reinforcement learning},
  journal   = {Science China Information Sciences},
  volume    = {67},
  number    = {2},
  pages     = {121101},
  publisher = {Springer},
  year      = {2024},
}

@article{moerland2023model,
  title     = {Model-based reinforcement learning: A survey},
  author    = {Moerland, Thomas M and Broekens, Joost and Plaat, Aske and Jonker, Catholijn M},
  journal   = {Foundations and Trends{\textregistered} in Machine Learning},
  volume    = {16},
  number    = {1},
  pages     = {1--118},
  year      = {2023},
  publisher = {Now Publishers, Inc.}
}

@article{benechehab2024zero,
  title   = {Zero-shot model-based reinforcement learning using large language models},
  author  = {Benechehab, Abdelhakim and El Hili, Youssef Attia and Odonnat, Ambroise and Zekri, Oussama and Thomas, Albert and Paolo, Giuseppe and Filippone, Maurizio and Redko, Ievgen and K{\'e}gl, Bal{\'a}zs},
  journal = {arXiv preprint arXiv:2410.11711},
  year    = {2024}
}

@article{hu2023language,
  title   = {Language models, agent models, and world models: The {LAW} for machine reasoning and planning},
  author  = {Hu, Zhiting and Shu, Tianmin},
  journal = {arXiv preprint arXiv:2312.05230},
  year    = {2023}
}

@article{gu2024your,
  title   = {Is your {LLM} secretly a world model of the internet? {Model-based} planning for web agents},
  author  = {Gu, Yu and Zhang, Kai and Ning, Yuting and Zheng, Boyuan and Gou, Boyu and Xue, Tianci and Chang, Cheng and Srivastava, Sanjari and Xie, Yanan and Qi, Peng and others},
  journal = {arXiv preprint arXiv:2411.06559},
  year    = {2024}
}

@article{assran2025v,
  title   = {{V-JEPA 2}: Self-supervised video models enable understanding, prediction and planning},
  author  = {Assran, Mido and Bardes, Adrien and Fan, David and Garrido, Quentin and Howes, Russell and Muckley, Matthew and Rizvi, Ammar and Roberts, Claire and Sinha, Koustuv and Zholus, Artem and others},
  journal = {arXiv preprint arXiv:2506.09985},
  year    = {2025}
}

@misc{avatarl2025,
  author       = {{tokenbender}},
  title        = {avatarl: training language models from scratch with pure reinforcement learning},
  year         = {2025},
  howpublished = {GitHub repository},
  url          = {https://github.com/tokenbender/avatarl}
}

@article{su2025thinking,
  author  = {Su, Zhaochen and Xia, Peng and Guo, Hangyu and Liu, Zhenhua and Ma, Yan and Qu, Xiaoye and Liu, Jiaqi and Li, Yanshu and Zeng, Kaide and Yang, Zhengyuan and others},
  title   = {Thinking with images for multimodal reasoning: Foundations, methods, and future frontiers},
  journal = {arXiv preprint arXiv:2506.23918},
  year    = {2025},
}

@article{wu2025rlvr,
  title   = {{RLVR-World}: Training World Models with Reinforcement Learning},
  author  = {Wu, Jialong and Yin, Shaofeng and Feng, Ningya and Long, Mingsheng},
  journal = {arXiv preprint arXiv:2505.13934},
  year    = {2025}
}

@article{wang2025thoughts,
  title   = {Thoughts are all over the place: On the underthinking of o1-like {LLMs}},
  author  = {Wang, Yue and Liu, Qiuzhi and Xu, Jiahao and Liang, Tian and Chen, Xingyu and He, Zhiwei and Song, Linfeng and Yu, Dian and Li, Juntao and Zhang, Zhuosheng and others},
  journal = {arXiv preprint arXiv:2501.18585},
  year    = {2025}
}

@article{sui2025stop,
  author  = {Sui, Yang and Chuang, Yu-Neng and Wang, Guanchu and Zhang, Jiamu and Zhang, Tianyi and Yuan, Jiayi and Liu, Hongyi and Wen, Andrew and Zhong, Shaochen and Chen, Hanjie and others},
  title   = {Stop overthinking: A survey on efficient reasoning for large language models},
  journal = {arXiv preprint arXiv:2503.16419},
  year    = {2025},
}

@article{qi2024webrl,
  title   = {{WebRL}: Training {LLM} web agents via self-evolving online curriculum reinforcement learning},
  author  = {Qi, Zehan and Liu, Xiao and Iong, Iat Long and Lai, Hanyu and Sun, Xueqiao and Zhao, Wenyi and Yang, Yu and Yang, Xinyue and Sun, Jiadai and Yao, Shuntian and others},
  journal = {arXiv preprint arXiv:2411.02337},
  year    = {2024}
}

@article{zhou2025academicbrowse,
  title   = {{AcademicBrowse}: Benchmarking Academic Browse Ability of {LLMs}},
  author  = {Zhou, Junting and Li, Wang and Liao, Yiyan and Zhang, Nengyuan and Miao, Tingjia and Qi, Zhihui and Wu, Yuhan and Yang, Tong},
  journal = {arXiv preprint arXiv:2506.13784},
  year    = {2025}
}

@article{vattikonda2025train,
  title   = {How to Train Your {LLM} Web Agent: A Statistical Diagnosis},
  author  = {Vattikonda, Dheeraj and Ravichandran, Santhoshi and Penaloza, Emiliano and Nekoei, Hadi and Thakkar, Megh and de Chezelles, Thibault Le Sellier and Gontier, Nicolas and Mu{\~n}oz-M{\'a}rmol, Miguel and Shayegan, Sahar Omidi and Raimondo, Stefania and others},
  journal = {arXiv preprint arXiv:2507.04103},
  year    = {2025}
}

@article{he2025thinkdial,
  title   = {{ThinkDial}: An Open Recipe for Controlling Reasoning Effort in Large Language Models},
  author  = {He, Qianyu and Yuan, Siyu and Li, Xuefeng and Wang, Mingxuan and Chen, Jiangjie},
  journal = {arXiv preprint arXiv:2508.18773},
  year    = {2025}
}

@article{borso2025preference,
  author  = {Borso, Umberto and Paglieri, Davide and Wells, Jude and Rockt{\"a}schel, Tim},
  title   = {Preference-based alignment of discrete diffusion models},
  journal = {arXiv preprint arXiv:2503.08295},
  year    = {2025},
}

@misc{JetAstra2025,
  title       = {{SDAR}: A Synergistic {Diffusion--AutoRegression} Paradigm for Scalable Sequence Generation},
  author      = {Cheng, Shuang and Bian, Yihan and Liu, Dawei and Jiang, Yuhua and Liu, Yihao and Zhang, Linfeng and Wang, Wenghai and Guo, Qipeng and Chen, Kai and Qi, Biqing and Zhou, Bowen},
  year        = {2025},
  institution = {Shanghai AI Lab},
  url         = {https://github.com/JetAstra/SDAR}
}

@inproceedings{yang2024using,
  author    = {Yang, Kai and Tao, Jian and Lyu, Jiafei and Ge, Chunjiang and Chen, Jiaxin and Shen, Weihan and Zhu, Xiaolong and Li, Xiu},
  title     = {Using human feedback to fine-tune diffusion models without any reward model},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {8941--8951},
  year      = {2024},
}

@article{tae2025tess,
  title   = {{TESS 2}: A large-scale generalist diffusion language model},
  author  = {Tae, Jaesung and Ivison, Hamish and Kumar, Sachin and Cohan, Arman},
  journal = {arXiv preprint arXiv:2502.13917},
  year    = {2025}
}

@article{zhu2025llada,
  title   = {{LLaDA 1.5}: Variance-Reduced Preference Optimization for Large Language Diffusion Models},
  author  = {Zhu, Fengqi and Wang, Rongzhen and Nie, Shen and Zhang, Xiaolu and Wu, Chunwei and Hu, Jun and Zhou, Jun and Chen, Jianfei and Lin, Yankai and Wen, Ji-Rong and others},
  journal = {arXiv preprint arXiv:2505.19223},
  year    = {2025}
}

@article{wang2025trado,
  author  = {Wang, Yinjie and Yang, Ling and Li, Bowen and Tian, Ye and Shen, Ke and Wang, Mengdi},
  title   = {Revolutionizing Reinforcement Learning Framework for Diffusion Large Language Models},
  journal = {arXiv preprint arXiv:2509.06949},
  year    = {2025},
}

@article{labs2025mercury,
  title   = {Mercury: Ultra-Fast Language Models Based on Diffusion},
  author  = {{Inception Labs} and Khanna, Samar and Kharbanda, Siddhant and Li, Shufan and Varma, Harshit and Wang, Eric and Birnbaum, Sawyer and Luo, Ziyang and Miraoui, Yanis and Palrecha, Akash and others},
  journal = {arXiv preprint arXiv:2506.17298},
  year    = {2025}
}

@article{yang2025mmada,
  title   = {{MMaDA}: Multimodal large diffusion language models},
  author  = {Yang, Ling and Tian, Ye and Li, Bowen and Zhang, Xinchen and Shen, Ke and Tong, Yunhai and Wang, Mengdi},
  journal = {arXiv preprint arXiv:2505.15809},
  year    = {2025}
}

@article{gong2025diffucoder,
  title   = {{DiffuCoder}: Understanding and Improving Masked Diffusion Models for Code Generation},
  author  = {Gong, Shansan and Zhang, Ruixiang and Zheng, Huangjie and Gu, Jiatao and Jaitly, Navdeep and Kong, Lingpeng and Zhang, Yizhe},
  journal = {arXiv preprint arXiv:2506.20639},
  year    = {2025}
}

@article{song2025seed,
  title   = {{Seed Diffusion}: A large-scale diffusion language model with high-speed inference},
  author  = {Song, Yuxuan and Zhang, Zheng and Luo, Cheng and Gao, Pengyang and Xia, Fan and Luo, Hao and Li, Zheng and Yang, Yuehang and Yu, Hongli and Qu, Xingwei and others},
  journal = {arXiv preprint arXiv:2508.02193},
  year    = {2025}
}

@article{he2025mdpo,
  title   = {{MDPO}: Overcoming the Training-Inference Divide of Masked Diffusion Language Models},
  author  = {He, Haoyu and Renz, Katrin and Cao, Yong and Geiger, Andreas},
  journal = {arXiv preprint arXiv:2508.13148},
  year    = {2025}
}

@misc{zhao2025d1scalingreasoningdiffusion,
  author        = {Zhao, Siyan and Gupta, Devaansh and Zheng, Qinqing and Grover, Aditya},
  title         = {d1: Scaling Reasoning in Diffusion Large Language Models via Reinforcement Learning},
  year          = {2025},
  eprint        = {2504.12216},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  url           = {https://arxiv.org/abs/2504.12216},
}


@article{ye2025dream,
  title   = {{Dream 7B}: Diffusion Large Language Models},
  author  = {Ye, Jiacheng and Xie, Zhihui and Zheng, Lin and Gao, Jiahui and Wu, Zirui and Jiang, Xin and Li, Zhenguo and Kong, Lingpeng},
  journal = {arXiv preprint arXiv:2508.15487},
  year    = {2025}
}

@article{xie2025dream,
  title={{Dream-Coder 7B}: An Open Diffusion Language Model for Code},
  author={Xie, Zhihui and Ye, Jiacheng and Zheng, Lin and Gao, Jiahui and Dong, Jingwei and Wu, Zirui and Zhao, Xueliang and Gong, Shansan and Jiang, Xin and Li, Zhenguo and others},
  journal={arXiv preprint arXiv:2509.01142},
  year={2025}
}

@misc{nie2025largelanguagediffusionmodels,
  author        = {Nie, Shen and Zhu, Fengqi and You, Zebin and Zhang, Xiaolu and Ou, Jingyang and Hu, Jun and Zhou, Jun and Lin, Yankai and Wen, Ji-Rong and Li, Chongxuan},
  title         = {Large Language Diffusion Models},
  year          = {2025},
  eprint        = {2502.09992},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  url           = {https://arxiv.org/abs/2502.09992},
}

@inproceedings{black2024training,
  author    = {Black, Kevin and Janner, Michael and Du, Yilun and Kostrikov, Ilya and Levine, Sergey},
  title     = {Training Diffusion Models with Reinforcement Learning},
  booktitle = {The Twelfth International Conference on Learning Representations},
  year      = {2024},
  url       = {https://openreview.net/forum?id=YCWjhGrJFD},
}

@article{heuillet2025nested,
  title   = {{Nested-ReFT}: Efficient Reinforcement Learning for Large Language Model Fine-Tuning via Off-Policy Rollouts},
  author  = {Heuillet, Maxime and Cui, Yufei and Chen, Boxing and Durand, Audrey and Parthasarathi, Prasanna},
  journal = {arXiv preprint arXiv:2508.10123},
  year    = {2025}
}

@article{deng2025atom,
  title   = {{Atom-Searcher}: Enhancing Agentic Deep Research via Fine-Grained Atomic Thought Reward},
  author  = {Deng, Yong and Wang, Guoqing and Ying, Zhenzhe and Wu, Xiaofeng and Lin, Jinzhen and Xiong, Wenwen and Dai, Yuqin and Yang, Shuo and Zhang, Zhanwei and Wang, Qiwen and others},
  journal = {arXiv preprint arXiv:2508.12800},
  year    = {2025}
}

@article{yu2025medreseacher,
  title   = {{MedReseacher-R1}: Expert-Level Medical Deep Researcher via A Knowledge-Informed Trajectory Synthesis Framework},
  author  = {Yu, Ailing and Yao, Lan and Liu, Jingnan and Chen, Zhe and Yin, Jiajun and Wang, Yuan and Liao, Xinhao and Ye, Zhiling and Li, Ji and Yue, Yun and others},
  journal = {arXiv preprint arXiv:2508.14880},
  year    = {2025}
}

@article{yang2025depth,
  title   = {Depth-Breadth Synergy in {RLVR}: Unlocking {LLM} Reasoning Gains with Adaptive Exploration},
  author  = {Yang, Zhicheng and Guo, Zhijiang and Huang, Yinya and Wang, Yongxin and Xie, Dongchun and Wang, Yiwei and Liang, Xiaodan and Tang, Jing},
  journal = {arXiv preprint arXiv:2508.13755},
  year    = {2025}
}

@article{guo2025g,
  title   = {{G$^2$RPO-A}: Guided Group Relative Policy Optimization with Adaptive Guidance},
  author  = {Guo, Yongxin and Deng, Wenbo and Cheng, Zhenglin and Tang, Xiaoying},
  journal = {arXiv preprint arXiv:2508.13023},
  year    = {2025}
}

@article{agarwal2022reincarnating,
  author  = {Agarwal, Rishabh and Schwarzer, Max and Castro, Pablo Samuel and Courville, Aaron C and Bellemare, Marc},
  title   = {Reincarnating reinforcement learning: Reusing prior computation to accelerate progress},
  journal = {Advances in neural information processing systems},
  volume  = {35},
  pages   = {28955--28971},
  year    = {2022},
}

@article{chu2025sft,
  title   = {{SFT} memorizes, {RL} generalizes: A comparative study of foundation model post-training},
  author  = {Chu, Tianzhe and Zhai, Yuexiang and Yang, Jihan and Tong, Shengbang and Xie, Saining and Schuurmans, Dale and Le, Quoc V and Levine, Sergey and Ma, Yi},
  journal = {arXiv preprint arXiv:2501.17161},
  year    = {2025}
}

@article{wang2024generalization,
  author  = {Wang, Xinyi and Antoniades, Antonis and Elazar, Yanai and Amayuelas, Alfonso and Albalak, Alon and Zhang, Kexun and Wang, William Yang},
  title   = {Generalization vs Memorization: Tracing Language Models' Capabilities Back to Pretraining Data},
  journal = {arXiv preprint arXiv:2407.14985},
  year    = {2024},
}

@article{kirk2023understanding,
  title   = {Understanding the effects of {RLHF} on {LLM} generalisation and diversity},
  author  = {Kirk, Robert and Mediratta, Ishita and Nalmpantis, Christoforos and Luketina, Jelena and Hambro, Eric and Grefenstette, Edward and Raileanu, Roberta},
  journal = {arXiv preprint arXiv:2310.06452},
  year    = {2023}
}

@article{swamy2025all,
  author  = {Swamy, Gokul and Choudhury, Sanjiban and Sun, Wen and Wu, Zhiwei Steven and Bagnell, J Andrew},
  title   = {All roads lead to likelihood: The value of reinforcement learning in fine-tuning},
  journal = {arXiv preprint arXiv:2503.01067},
  year    = {2025},
}

@article{liu2025uft,
  title   = {{UFT}: Unifying Supervised and Reinforcement Fine-Tuning},
  author  = {Liu, Mingyang and Farina, Gabriele and Ozdaglar, Asuman},
  journal = {arXiv preprint arXiv:2505.16984},
  year    = {2025}
}

@article{ma2025learning,
  author  = {Ma, Lu and Liang, Hao and Qiang, Meiyi and Tang, Lexiang and Ma, Xiaochen and Wong, Zhen Hao and Niu, Junbo and Shen, Chengyu and He, Runming and Cui, Bin and others},
  title   = {Learning What Reinforcement Learning Can't: Interleaved Online Fine-Tuning for Hardest Questions},
  journal = {arXiv preprint arXiv:2506.07527},
  year    = {2025},
}

@article{huan2025does,
  title   = {Does Math Reasoning Improve General {LLM} Capabilities? {Understanding} Transferability of {LLM} Reasoning},
  author  = {Huan, Maggie and Li, Yuetai and Zheng, Tuney and Xu, Xiaoyu and Kim, Seungone and Du, Minxin and Poovendran, Radha and Neubig, Graham and Yue, Xiang},
  journal = {arXiv preprint arXiv:2507.00432},
  year    = {2025}
}

@article{satvaty2024undesirable,
  author  = {Satvaty, Ali and Verberne, Suzan and Turkmen, Fatih},
  title   = {Undesirable memorization in large language models: A survey},
  journal = {arXiv preprint arXiv:2410.02650},
  year    = {2024},
}

@article{qin2025supervised,
  author  = {Qin, Chongli and Springenberg, Jost Tobias},
  title   = {Supervised Fine Tuning on Curated Data is Reinforcement Learning (and can be improved)},
  journal = {arXiv preprint arXiv:2507.12856},
  year    = {2025},
}

@article{zhang2025bread,
  author  = {Zhang, Xuechen and Huang, Zijian and Li, Yingcong and Ni, Chenshun and Chen, Jiasi and Oymak, Samet},
  title   = {BREAD: Branched Rollouts from Expert Anchors Bridge SFT and RL for Reasoning},
  journal = {arXiv preprint arXiv:2506.17211},
  year    = {2025},
}

@article{jin2025rl,
  author  = {Jin, Hangzhan and Lv, Sicheng and Wu, Sifan and Hamdaqa, Mohammad},
  title   = {RL Is Neither a Panacea Nor a Mirage: Understanding Supervised vs. Reinforcement Learning Fine-Tuning for LLMs},
  journal = {arXiv preprint arXiv:2508.16546},
  year    = {2025},
}

@article{fu2025srft,
  author  = {Fu, Yuqian and Chen, Tinghong and Chai, Jiajun and Wang, Xihuai and Tu, Songjun and Yin, Guojun and Lin, Wei and Zhang, Qichao and Zhu, Yuanheng and Zhao, Dongbin},
  title   = {SRFT: A Single-Stage Method with Supervised and Reinforcement Fine-Tuning for Reasoning},
  journal = {arXiv preprint arXiv:2506.19767},
  year    = {2025},
}

@article{zhu2025proximal,
  author  = {Zhu, Wenhong and Xie, Ruobing and Wang, Rui and Sun, Xingwu and Wang, Di and Liu, Pengfei},
  title   = {Proximal Supervised Fine-Tuning},
  journal = {arXiv preprint arXiv:2508.17784},
  year    = {2025},
}

@article{wen2025reinforcement,
  author  = {Wen, Xumeng and Liu, Zihan and Zheng, Shun and Xu, Zhijian and Ye, Shengyu and Wu, Zhirong and Liang, Xiao and Wang, Yang and Li, Junjie and Miao, Ziming and others},
  title   = {Reinforcement Learning with Verifiable Rewards Implicitly Incentivizes Correct Reasoning in Base LLMs},
  journal = {arXiv preprint arXiv:2506.14245},
  year    = {2025},
}

@article{liang2025beyond,
  title   = {Beyond {Pass@1}: Self-Play with Variational Problem Synthesis Sustains {RLVR}},
  author  = {Liang, Xiao and Li, Zhongzhi and Gong, Yeyun and Shen, Yelong and Wu, Ying Nian and Guo, Zhijiang and Chen, Weizhu},
  journal = {arXiv preprint arXiv:2508.14029},
  year    = {2025}
}

@misc{tinyzero,
  title        = {TinyZero},
  author       = {Jiayi Pan and Junjie Zhang and Xingyao Wang and Lifan Yuan and Hao Peng and Alane Suhr},
  year         = {2025},
  howpublished = {https://github.com/Jiayi-Pan/TinyZero},
  note         = {Accessed: 2025-01-24},
}

@article{wang2025octothinker,
  title   = {{OctoThinker}: Mid-training incentivizes reinforcement learning scaling},
  author  = {Wang, Zengzhi and Zhou, Fan and Li, Xuefeng and Liu, Pengfei},
  journal = {arXiv preprint arXiv:2506.20512},
  year    = {2025}
}

@article{li2025thinking,
  title   = {When thinking fails: The pitfalls of reasoning for instruction-following in {LLMs}},
  author  = {Li, Xiaomin and Yu, Zhou and Zhang, Zhiwei and Chen, Xupeng and Zhang, Ziji and Zhuang, Yingying and Sadagopan, Narayanan and Beniwal, Anurag},
  journal = {arXiv preprint arXiv:2505.11423},
  year    = {2025}
}

@article{fu2025scaling,
  author  = {Fu, Tingchen and Gu, Jiawei and Li, Yafu and Qu, Xiaoye and Cheng, Yu},
  title   = {Scaling reasoning, losing control: Evaluating instruction following in large reasoning models},
  journal = {arXiv preprint arXiv:2505.14810},
  year    = {2025},
}

@article{xiong2025minimalist,
  title   = {A minimalist approach to {LLM} reasoning: from rejection sampling to {Reinforce}},
  author  = {Xiong, Wei and Yao, Jiarui and Xu, Yuhui and Pang, Bo and Wang, Lei and Sahoo, Doyen and Li, Junnan and Jiang, Nan and Zhang, Tong and Xiong, Caiming and others},
  journal = {arXiv preprint arXiv:2504.11343},
  year    = {2025}
}

@article{xiao2025bnpo,
  author  = {Xiao, Changyi and Zhang, Mengdi and Cao, Yixin},
  title   = {BNPO: Beta Normalization Policy Optimization},
  journal = {arXiv preprint arXiv:2506.02864},
  year    = {2025},
}

@article{nimmaturi2025predictive,
  author  = {Nimmaturi, Datta and Bhargava, Vaishnavi and Ghosh, Rajat and George, Johnu and Dutta, Debojyoti},
  title   = {Predictive Scaling Laws for Efficient GRPO Training of Large Reasoning Models},
  journal = {arXiv preprint arXiv:2507.18014},
  year    = {2025},
}

@article{dong2023raft,
  title   = {{RAFT}: Reward ranked finetuning for generative foundation model alignment},
  author  = {Dong, Hanze and Xiong, Wei and Goyal, Deepanshu and Zhang, Yihan and Chow, Winnie and Pan, Rui and Diao, Shizhe and Zhang, Jipeng and Shum, Kashun and Zhang, Tong},
  journal = {arXiv preprint arXiv:2304.06767},
  year    = {2023}
}

@article{liu2023statistical,
  author  = {Liu, Tianqi and Zhao, Yao and Joshi, Rishabh and Khalman, Misha and Saleh, Mohammad and Liu, Peter J and Liu, Jialu},
  title   = {Statistical rejection sampling improves preference optimization},
  journal = {arXiv preprint arXiv:2309.06657},
  year    = {2023},
}

@article{silver2021reward,
  title     = {Reward is enough},
  author    = {Silver, David and Singh, Satinder and Precup, Doina and Sutton, Richard S},
  journal   = {Artificial Intelligence},
  volume    = {299},
  pages     = {103535},
  year      = {2021},
  publisher = {Elsevier}
}

@article{el2025competitive,
  author  = {El-Kishky, Ahmed and Wei, Alexander and Saraiva, Andre and Minaiev, Borys and Selsam, Daniel and Dohan, David and Song, Francis and Lightman, Hunter and Clavera, Ignasi and Pachocki, Jakub and others},
  title   = {Competitive programming with large reasoning models},
  journal = {arXiv preprint arXiv:2502.06807},
  year    = {2025},
}

@inproceedings{bowling2023settling,
  author       = {Bowling, Michael and Martin, John D and Abel, David and Dabney, Will},
  title        = {Settling the reward hypothesis},
  booktitle    = {International Conference on Machine Learning},
  organization = {PMLR},
  pages        = {3003--3020},
  year         = {2023},
}

@article{arcuschin2025chain,
  author  = {Arcuschin, Iv{\'a}n and Janiak, Jett and Krzyzanowski, Robert and Rajamanoharan, Senthooran and Nanda, Neel and Conmy, Arthur},
  title   = {Chain-of-thought reasoning in the wild is not always faithful},
  journal = {arXiv preprint arXiv:2503.08679},
  year    = {2025},
}

@article{bai2025intern,
  author  = {Bai, Lei and Cai, Zhongrui and Cao, Maosong and Cao, Weihan and Chen, Chiyu and Chen, Haojiong and Chen, Kai and Chen, Pengcheng and Chen, Ying and Chen, Yongkang and others},
  title   = {Intern-S1: A Scientific Multimodal Foundation Model},
  journal = {arXiv preprint arXiv:2508.15763},
  year    = {2025},
}

@article{sun2025detection,
  author  = {Sun, Zhongxiang and Wang, Qipeng and Wang, Haoyu and Zhang, Xiao and Xu, Jun},
  title   = {Detection and Mitigation of Hallucination in Large Reasoning Models: A Mechanistic Perspective},
  journal = {arXiv preprint arXiv:2505.12886},
  year    = {2025},
}

@article{baker2025monitoring,
  author  = {Baker, Bowen and Huizinga, Joost and Gao, Leo and Dou, Zehao and Guan, Melody Y and Madry, Aleksander and Zaremba, Wojciech and Pachocki, Jakub and Farhi, David},
  title   = {Monitoring reasoning models for misbehavior and the risks of promoting obfuscation},
  journal = {arXiv preprint arXiv:2503.11926},
  year    = {2025},
}

@article{yin2025dynamic,
  author  = {Yin, Zhangyue and Sun, Qiushi and Zeng, Zhiyuan and Cheng, Qinyuan and Qiu, Xipeng and Huang, Xuanjing},
  title   = {Dynamic and Generalizable Process Reward Modeling},
  journal = {arXiv preprint arXiv:2507.17849},
  year    = {2025},
}

@article{zhou2025does,
  author  = {Zhou, Ruochen and Xu, Minrui and Chen, Shiqi and Liu, Junteng and Li, Yunqi and Lin, Xinxin and Chen, Zhengyu and He, Junxian},
  title   = {Does Learning Mathematical Problem-Solving Generalize to Broader Reasoning?},
  journal = {arXiv preprint arXiv:2507.04391},
  year    = {2025},
}

@article{fallahpour_bioreason_2025,
  title   = {{BioReason}: Incentivizing Multimodal Biological Reasoning within a {DNA}-{LLM} Model},
  author  = {Fallahpour, Adibvafa and Magnuson, Andrew and Gupta, Purav and Ma, Shihao and Naimer, Jack and Shah, Arnav and Duan, Haonan and Ibrahim, Omar and Goodarzi, Hani and Maddison, Chris J. and Wang, Bo},
  journal = {arXiv preprint arXiv:2505.23579},
  year    = {2025}
}

@article{narayanan_training_2025,
  title   = {Training a Scientific Reasoning Model for Chemistry},
  author  = {Narayanan, Siddharth M. and Braza, James D. and Griffiths, Ryan-Rhys and Bou, Albert and Wellawatte, Geemi and Ramos, Mayk Caldas and Mitchener, Ludovico and Rodriques, Samuel G. and White, Andrew D.},
  journal = {arXiv preprint arXiv:2506.17238},
  year    = {2025}
}

@article{bigaud_owkinzero_2025,
  title   = {{OwkinZero}: Accelerating Biological Discovery with {AI}},
  author  = {Bigaud, Nathan and Cabeli, Vincent and G{\"u}rel, Meltem and Pignet, Arthur and Klein, John and Wainrib, Gilles and Durand, Eric},
  journal = {arXiv preprint arXiv:2508.16315},
  year    = {2025}
}

@article{istrate_rbio1-training_2025,
  author  = {Istrate, Ana-Maria and Milletari, Fausto and Castrotorres, Fabrizio and Tomczak, Jakub M and Torkar, Michaela and Li, Donghui and Karaletsos, Theofanis},
  title   = {rbio1-training scientific reasoning {LLMs} with biological world models as soft verifiers},
  journal = {bioRxiv 2025.08.18.670981},
  year    = {2025},
}

@misc{hla_pro-1_2025,
  author = {Hla, Michael},
  title  = {Pro-1},
  year   = {2025},
  url    = {https://michaelhla.com/blog/pro1.html},
}

@article{jin_stella_nodate,
  author  = {Jin, Ruofan and Zhang, Zaixi and Wang, Mengdi and Cong, Le},
  title   = {{STELLA}: Self-Evolving {LLM} Agent for Biomedical Research},
  journal = {bioRxiv: 2025.07.01.662467},
  year    = {2025},
}

@article{bunne2024build,
  author    = {Bunne, Charlotte and Roohani, Yusuf and Rosen, Yanay and Gupta, Ankit and Zhang, Xikun and Roed, Marcel and Alexandrov, Theo and AlQuraishi, Mohammed and Brennan, Patricia and Burkhardt, Daniel B and others},
  title     = {How to build the virtual cell with artificial intelligence: Priorities and opportunities},
  journal   = {Cell},
  publisher = {Elsevier},
  volume    = {187},
  number    = {25},
  pages     = {7045--7063},
  year      = {2024},
}

@article{noutahi_virtual_2025,
  title   = {Virtual Cells: Predict, Explain, Discover},
  author  = {Noutahi, Emmanuel and Hartford, Jason and Tossou, Prudencio and Whitfield, Shawn and Denton, Alisandra K. and Wognum, Cas and Ulicna, Kristina and Craig, Michael and Hsu, Jonathan and Cuccarese, Michael and others},
  journal = {arXiv preprint arXiv:2505.14613},
  year    = {2025}
}

@article{ghareeb_robin_2025,
  title   = {Robin: A multi-agent system for automating scientific discovery},
  author  = {Ghareeb, Ali Essam and Chang, Benjamin and Mitchener, Ludovico and Yiu, Angela and Szostkiewicz, Caralyn J. and Laurent, Jon M. and Razzak, Muhammed T. and White, Andrew D. and Hinks, Michaela M. and Rodriques, Samuel G.},
  journal = {arXiv preprint arXiv:2505.13400},
  year    = {2025}
}

@article{boiko2023autonomous,
  author    = {Boiko, Daniil A and MacKnight, Robert and Kline, Ben and Gomes, Gabe},
  title     = {Autonomous chemical research with large language models},
  journal   = {Nature},
  publisher = {Nature Publishing Group UK London},
  volume    = {624},
  number    = {7992},
  pages     = {570--578},
  year      = {2023},
}

@article{fang_cell-o1_2025,
  title   = {Cell-o1: Training {LLMs} to Solve Single-Cell Reasoning Puzzles with Reinforcement Learning},
  author  = {Fang, Yin and Jin, Qiao and Xiong, Guangzhi and Jin, Bowen and Zhong, Xianrui and Ouyang, Siru and Zhang, Aidong and Han, Jiawei and Lu, Zhiyong},
  journal = {arXiv preprint arXiv:2506.02911},
  year    = {2025}
}

# NOTE(review): the former `journal = {Bioinformatics}` was ignored by @misc and looks
# like a preprint-server subject category rather than a venue -- presumably a bioRxiv
# preprint; confirm the venue and add the DOI.
@misc{kedzierska_assessing_2023,
  title        = {Assessing the limits of zero-shot foundation models in single-cell biology},
  author       = {Kedzierska, Kasia Z. and Crawford, Lorin and Amini, Ava P. and Lu, Alex X.},
  howpublished = {bioRxiv preprint},
  year         = {2023}
}

# NOTE(review): `volumn` was a misspelt (hence ignored) field; volume/number below are
# taken from the published Nature Methods record -- confirm against the publisher page.
@article{ahlmann-eltze_deep-learning-based_2025,
  title   = {Deep-learning-based gene perturbation effect prediction does not yet outperform simple linear baselines},
  author  = {Ahlmann-Eltze, Constantin and Huber, Wolfgang and Anders, Simon},
  journal = {Nature Methods},
  volume  = {22},
  number  = {8},
  pages   = {1657--1661},
  year    = {2025}
}

@article{rizvi_scaling_2025,
  author  = {Rizvi, Syed Asad and Levine, Daniel and Patel, Aakash and Zhang, Shiyang and Wang, Eric and He, Sizhuang and Zhang, David and Tang, Cerise and Lyu, Zhuoyang and Darji, Rayyan and Li, Chang and Sun, Emily and Jeong, David and Zhao, Lawrence and Kwan, Jennifer and Braun, David and Hafler, Brian and Ishizuka, Jeffrey and Dhodapkar, Rahul M and Chung, Hattie and Azizi, Shekoofeh and Perozzi, Bryan and van Dijk, David},
  title   = {Scaling Large Language Models for Next-Generation Single-Cell Analysis},
  journal = {bioRxiv: 2025.04.14.648850},
  year    = {2025},
}

@article{li2025temporal,
  author  = {Li, Yuetai and Xu, Zhangchen and Jiang, Fengqing and Ramasubramanian, Bhaskar and Niu, Luyao and Lin, Bill Yuchen and Yue, Xiang and Poovendran, Radha},
  title   = {Temporal Sampling for Forgotten Reasoning in LLMs},
  journal = {arXiv preprint arXiv:2505.20196},
  year    = {2025},
}

@article{brixi_genome_2025,
  author  = {Brixi, Garyk and Durrant, Matthew G. and Ku, Jerome and Poli, Michael and Brockman, Greg and Chang, Daniel and Gonzalez, Gabriel A. and King, Samuel H. and Li, David B. and Merchant, Aditi T. and others},
  title   = {Genome modeling and design across all domains of life with Evo 2},
  journal = {bioRxiv 2025.02.18.638918},
  year    = {2025},
}

@article{wang2024rlcoder,
  title   = {{RLCoder}: Reinforcement learning for repository-level code completion},
  author  = {Wang, Yanlin and Wang, Yanli and Guo, Daya and Chen, Jiachi and Zhang, Ruikai and Ma, Yuchi and Zheng, Zibin},
  journal = {arXiv preprint arXiv:2407.19487},
  year    = {2024}
}

@article{wang2024repogenreflex,
  title   = {{RepoGenReflex}: Enhancing repository-level code completion with verbal reinforcement and retrieval-augmented generation},
  author  = {Wang, Jicheng and He, Yifeng and Chen, Hao},
  journal = {arXiv preprint arXiv:2409.13122},
  year    = {2024}
}

@article{liu2025ml,
  title   = {{ML-Agent}: Reinforcing {LLM} agents for autonomous machine learning engineering},
  author  = {Liu, Zexi and Chai, Jingyi and Zhu, Xinyu and Tang, Shuo and Ye, Rui and Zhang, Bo and Bai, Lei and Chen, Siheng},
  journal = {arXiv preprint arXiv:2505.23723},
  year    = {2025}
}

@article{chan2024mle,
  title   = {{MLE-bench}: Evaluating machine learning agents on machine learning engineering},
  author  = {Chan, Jun Shern and Chowdhury, Neil and Jaffe, Oliver and Aung, James and Sherburn, Dane and Mays, Evan and Starace, Giulio and Liu, Kevin and Maksin, Leon and Patwardhan, Tejal and others},
  journal = {arXiv preprint arXiv:2410.07095},
  year    = {2024}
}

@article{nam2025mle,
  author  = {Nam, Jaehyun and Yoon, Jinsung and Chen, Jiefeng and Shin, Jinwoo and Ar{\i}k, Sercan {\"O} and Pfister, Tomas},
  title   = {MLE-STAR: Machine Learning Engineering Agent via Search and Targeted Refinement},
  journal = {arXiv preprint arXiv:2506.15692},
  year    = {2025},
}

@article{huang2025formarl,
  author  = {Huang, Yanxing and Jin, Xinling and Liang, Sijie and Li, Peng and Liu, Yang},
  title   = {FormaRL: Enhancing Autoformalization with no Labeled Data},
  journal = {arXiv preprint arXiv:2508.18914},
  year    = {2025},
}

@article{chen2025breaking,
  author  = {Chen, Lei and Zhao, Xuanle and Zeng, Zhixiong and Huang, Jing and Zheng, Liming and Zhong, Yufeng and Ma, Lin},
  title   = {Breaking the SFT Plateau: Multimodal Structured Reinforcement Learning for Chart-to-Code Generation},
  journal = {arXiv preprint arXiv:2508.13587},
  year    = {2025},
}

@article{shumailov2024ai,
  author    = {Shumailov, Ilia and Shumaylov, Zakhar and Zhao, Yiren and Papernot, Nicolas and Anderson, Ross and Gal, Yarin},
  title     = {AI models collapse when trained on recursively generated data},
  journal   = {Nature},
  publisher = {Nature Publishing Group UK London},
  volume    = {631},
  number    = {8022},
  pages     = {755--759},
  year      = {2024},
}

@article{villalobos2022will,
  author  = {Villalobos, Pablo and Ho, Anson and Sevilla, Jaime and Besiroglu, Tamay and Heim, Lennart and Hobbhahn, Marius},
  title   = {Will we run out of data? Limits of LLM scaling based on human-generated data},
  journal = {arXiv preprint arXiv:2211.04325},
  year    = {2022},
}

@misc{SemiAnalysis_2025,
  author       = {{SemiAnalysis}},
  title        = {Scaling Reinforcement Learning: Environments, Reward Hacking, Agents, Scaling Data},
  howpublished = {Blog post},
  url          = {https://semianalysis.com/2025/06/08/scaling-reinforcement-learning-environments-reward-hacking-agents-scaling-data/},
  month        = jun,
  year         = {2025}
}

@article{wen2025budgetthinker,
  author  = {Wen, Hao and Wu, Xinrui and Sun, Yi and Zhang, Feifei and Chen, Liye and Wang, Jie and Liu, Yunxin and Zhang, Ya-Qin and Li, Yuanchun},
  title   = {BudgetThinker: Empowering Budget-aware LLM Reasoning with Control Tokens},
  journal = {arXiv preprint arXiv:2508.17196},
  year    = {2025},
}

@article{yan2025drqa,
  author  = {Yan, Kaiwen and Shi, Xuanqing and Guo, Hongcheng and Wang, Wenxuan and Zhang, Zhuosheng and Qin, Chengwei},
  title   = {DRQA: Dynamic Reasoning Quota Allocation for Controlling Overthinking in Reasoning Large Language Models},
  journal = {arXiv preprint arXiv:2508.17803},
  year    = {2025},
}

@article{xu2025sspo,
  author  = {Xu, Yuyang and Cheng, Yi and Ying, Haochao and Du, Zhuoyun and Hu, Renjun and Shi, Xing and Lin, Wei and Wu, Jian},
  title   = {SSPO: Self-traced Step-wise Preference Optimization for Process Supervision and Reasoning Compression},
  journal = {arXiv preprint arXiv:2508.12604},
  year    = {2025},
}

@article{zhu2025think,
  author  = {Zhu, Yekun and Chen, Guang and Mao, Chengjun},
  title   = {Think in Blocks: Adaptive Reasoning from Direct Response to Deep Reasoning},
  journal = {arXiv preprint arXiv:2508.15507},
  year    = {2025},
}

@misc{1606.01540,
  author        = {Greg Brockman and Vicki Cheung and Ludwig Pettersson and Jonas Schneider and John Schulman and Jie Tang and Wojciech Zaremba},
  title         = {{OpenAI Gym}},
  year          = {2016},
  eprint        = {1606.01540},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/1606.01540}
}

@article{pan2025survey,
  author  = {Pan, Chaofan and Yang, Xin and Li, Yanhua and Wei, Wei and Li, Tianrui and An, Bo and Liang, Jiye},
  title   = {A Survey of Continual Reinforcement Learning},
  journal = {arXiv preprint arXiv:2506.21872},
  year    = {2025},
}

@article{abel2023definition,
  author  = {Abel, David and Barreto, Andr{\'e} and Van Roy, Benjamin and Precup, Doina and van Hasselt, Hado P and Singh, Satinder},
  title   = {A definition of continual reinforcement learning},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {36},
  pages   = {50377--50407},
  year    = {2023},
}

@article{hou2025model,
  title   = {{Model Context Protocol} ({MCP}): Landscape, security threats, and future research directions},
  author  = {Hou, Xinyi and Zhao, Yanjie and Wang, Shenao and Wang, Haoyu},
  journal = {arXiv preprint arXiv:2503.23278},
  year    = {2025}
}

@article{lumer2025scalemcp,
  author  = {Lumer, Elias and Gulati, Anmol and Subbiah, Vamse Kumar and Basavaraju, Pradeep Honaganahalli and Burke, James A},
  title   = {ScaleMCP: Dynamic and Auto-Synchronizing Model Context Protocol Tools for LLM Agents},
  journal = {arXiv preprint arXiv:2505.06416},
  year    = {2025},
}
@article{feng2025efficient,
  author  = {Feng, Sicheng and Fang, Gongfan and Ma, Xinyin and Wang, Xinchao},
  title   = {Efficient reasoning models: A survey},
  journal = {arXiv preprint arXiv:2504.10903},
  year    = {2025},
}

@article{xia2024beyond,
  title   = {Beyond chain-of-thought: A survey of {Chain-of-X} paradigms for {LLMs}},
  author  = {Xia, Yu and Wang, Rui and Liu, Xu and Li, Mingyan and Yu, Tong and Chen, Xiang and McAuley, Julian and Li, Shuai},
  journal = {arXiv preprint arXiv:2404.15676},
  year    = {2024}
}


@article{guan2025recall,
  author  = {Guan, Zhong and Wu, Likang and Zhao, Hongke and Wang, Jiahui and Wu, Le},
  title   = {Recall-Extend Dynamics: Enhancing Small Language Models through Controlled Exploration and Refined Offline Integration},
  journal = {arXiv preprint arXiv:2508.16677},
  year    = {2025},
}

@inproceedings{lee2023rlaif,
  author    = {Lee, Harrison and Phatale, Samrat and Mansoor, Hassan and Mesnard, Thomas and Ferret, Johan and Lu, Kellie Ren and Bishop, Colton and Hall, Ethan and Carbune, Victor and Rastogi, Abhinav and Prakash, Sushant},
  title     = {{RLAIF} vs. {RLHF}: Scaling Reinforcement Learning from Human Feedback with {AI} Feedback},
  booktitle = {Proceedings of the 41st International Conference on Machine Learning},
  editor    = {Salakhutdinov, Ruslan and Kolter, Zico and Heller, Katherine and Weller, Adrian and Oliver, Nuria and Scarlett, Jonathan and Berkenkamp, Felix},
  series    = {Proceedings of Machine Learning Research},
  volume    = {235},
  pages     = {26874--26901},
  publisher = {PMLR},
  month     = {21--27 Jul},
  year      = {2024},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v235/main/assets/lee24t/lee24t.pdf},
  url       = {https://proceedings.mlr.press/v235/lee24t.html},
}

@article{chen2025robotwin,
  author  = {Chen, Tianxing and Chen, Zanxin and Chen, Baijun and Cai, Zijian and Liu, Yibin and Liang, Qiwei and Li, Zixuan and Lin, Xianliang and Ge, Yiheng and Gu, Zhenyu and others},
  title   = {RoboTwin 2.0: A Scalable Data Generator and Benchmark with Strong Domain Randomization for Robust Bimanual Robotic Manipulation},
  journal = {arXiv preprint arXiv:2506.18088},
  year    = {2025},
}

@misc{RLinf_repo,
  title        = {{RLinf}: Reinforcement Learning Infrastructure for Agentic {AI}},
  author       = {{RLinf Team}},
  howpublished = {\url{https://github.com/RLinf/RLinf}},
  note         = {GitHub repository},
  year         = {2025}
}

@misc{qwq32b,
  title  = {{QwQ-32B}: Embracing the Power of Reinforcement Learning},
  url    = {https://qwenlm.github.io/blog/qwq-32b/},
  author = {{Qwen Team}},
  month  = mar,
  year   = {2025}
}

@misc{seed2025seed-oss,
  author = {{ByteDance Seed Team}},
  title  = {{Seed-OSS} Open-Source Models},
  year   = {2025},
  url    = {https://github.com/ByteDance-Seed/seed-oss}
}

@article{seed_seed15-thinking_2025,
  title   = {Seed1.5-Thinking: Advancing Superb Reasoning Models with Reinforcement Learning},
  author  = {Seed, {ByteDance} and Chen, Jiaze and Fan, Tiantian and Liu, Xin and Liu, Lingjun and Lin, Zhiqi and Wang, Mingxuan and Wang, Chengyi and Wei, Xiangpeng and Xu, Wenyuan and others},
  journal = {arXiv preprint arXiv:2504.13914},
  year    = {2025}
}

@article{zhu_internvl3_2025,
  title   = {{InternVL}3: Exploring Advanced Training and Test-Time Recipes for Open-Source Multimodal Models},
  author  = {Zhu, Jinguo and Wang, Weiyun and Chen, Zhe and Liu, Zhaoyang and Ye, Shenglong and Gu, Lixin and Tian, Hao and Duan, Yuchen and Su, Weijie and Shao, Jie and others},
  journal = {arXiv preprint arXiv:2504.10479},
  year    = {2025}
}

@article{wang_internvl35_2025,
  title   = {{InternVL}3.5: Advancing Open-Source Multimodal Models in Versatility, Reasoning, and Efficiency},
  author  = {Wang, Weiyun and Gao, Zhangwei and Gu, Lixin and Pu, Hengjun and Cui, Long and Wei, Xingguang and Liu, Zhaoyang and Jing, Linglin and Ye, Shenglong and Shao, Jie and others},
  journal = {arXiv preprint arXiv:2508.18265},
  year    = {2025}
}

@article{wang_skywork_2025,
  title   = {{Skywork} {R1V2}: Multimodal Hybrid Reinforcement Learning for Reasoning},
  author  = {Wang, Peiyu and Wei, Yichen and Peng, Yi and Wang, Xiaokun and Qiu, Weijie and Shen, Wei and Xie, Tianyidan and Pei, Jiangbo and Zhang, Jianhao and Hao, Yunzhuo and others},
  journal = {arXiv preprint arXiv:2504.16656},
  year    = {2025}
}

@article{team_glm-45v_2025,
  title   = {{GLM}-4.5V and {GLM}-4.1V-Thinking: Towards Versatile Multimodal Reasoning with Scalable Reinforcement Learning},
  author  = {Team, {GLM}-V. and Hong, Wenyi and Yu, Wenmeng and Gu, Xiaotao and Wang, Guo and Gan, Guobing and Tang, Haomiao and Cheng, Jiale and Qi, Ji and Ji, Junhui and others},
  journal = {arXiv preprint arXiv:2507.01006},
  year    = {2025}
}

@misc{qvq,
  author = {{Qwen Team}},
  title  = {QVQ: To See the World with Wisdom},
  year   = {2025},
  url    = {https://qwenlm.github.io/blog/qvq-72b-preview},
}

@misc{anthropic2025claude41,
  author = {{Anthropic}},
  title  = {Claude Opus 4.1},
  year   = {2025},
  url    = {https://www.anthropic.com/claude/opus},
}

@article{lambert2024tulu,
  author  = {Lambert, Nathan and Morrison, Jacob and Pyatkin, Valentina and Huang, Shengyi and Ivison, Hamish and Brahman, Faeze and Miranda, Lester James V and Liu, Alisa and Dziri, Nouha and Lyu, Shane and others},
  title   = {Tulu 3: Pushing frontiers in open language model post-training},
  journal = {arXiv preprint arXiv:2411.15124},
  year    = {2024},
}

@article{chen2025r1,
  author  = {Chen, Yongchao and Liu, Yueying and Zhou, Junwei and Hao, Yilun and Wang, Jingquan and Zhang, Yang and Fan, Chuchu},
  title   = {R1-Code-Interpreter: Training LLMs to Reason with Code via Supervised and Reinforcement Learning},
  journal = {arXiv preprint arXiv:2505.21668},
  year    = {2025},
}

@article{grattafiori2024llama,
  title   = {The {Llama} 3 herd of models},
  author  = {Grattafiori, Aaron and Dubey, Abhimanyu and Jauhri, Abhinav and Pandey, Abhinav and Kadian, Abhishek and Al-Dahle, Ahmad and Letman, Aiesha and Mathur, Akhil and Schelten, Alan and Vaughan, Alex and others},
  journal = {arXiv preprint arXiv:2407.21783},
  year    = {2024}
}

@article{hu2024minicpm,
  title   = {{MiniCPM}: Unveiling the potential of small language models with scalable training strategies},
  author  = {Hu, Shengding and Tu, Yuge and Han, Xu and He, Chaoqun and Cui, Ganqu and Long, Xiang and Zheng, Zhi and Fang, Yewei and Huang, Yuxiang and Zhao, Weilin and others},
  journal = {arXiv preprint arXiv:2404.06395},
  year    = {2024}
}

@article{olmo20242,
  author  = {OLMo, Team and Walsh, Pete and Soldaini, Luca and Groeneveld, Dirk and Lo, Kyle and Arora, Shane and Bhagia, Akshita and Gu, Yuling and Huang, Shengyi and Jordan, Matt and others},
  title   = {2 OLMo 2 Furious},
  journal = {arXiv preprint arXiv:2501.00656},
  year    = {2024},
}

@article{parmar2024reuse,
  author  = {Parmar, Jupinder and Satheesh, Sanjev and Patwary, Mostofa and Shoeybi, Mohammad and Catanzaro, Bryan},
  title   = {Reuse, don't retrain: A recipe for continued pretraining of language models},
  journal = {arXiv preprint arXiv:2407.07263},
  year    = {2024},
}

@article{deng2025trial,
  author  = {Deng, Jia and Chen, Jie and Chen, Zhipeng and Cheng, Daixuan and Bai, Fei and Zhang, Beichen and Min, Yinqian and Gao, Yanzipeng and Zhao, Wayne Xin and Wen, Ji-Rong},
  title   = {From Trial-and-Error to Improvement: A Systematic Analysis of LLM Exploration Mechanisms in RLVR},
  journal = {arXiv preprint arXiv:2508.07534},
  year    = {2025},
}

@article{yuan2025native,
  author  = {Yuan, Jingyang and Gao, Huazuo and Dai, Damai and Luo, Junyu and Zhao, Liang and Zhang, Zhengyan and Xie, Zhenda and Wei, YX and Wang, Lean and Xiao, Zhiping and others},
  title   = {Native sparse attention: Hardware-aligned and natively trainable sparse attention},
  journal = {arXiv preprint arXiv:2502.11089},
  year    = {2025},
}

@article{wang2025step,
  author  = {Wang, Bin and Wang, Bojun and Wan, Changyi and Huang, Guanzhe and Hu, Hanpeng and Jia, Haonan and Nie, Hao and Li, Mingliang and Chen, Nuo and Chen, Siyu and others},
  title   = {Step-3 is Large yet Affordable: Model-system Co-design for Cost-effective Decoding},
  journal = {arXiv preprint arXiv:2507.19427},
  year    = {2025},
}

@article{shazeer2017outrageously,
  author  = {Shazeer, Noam and Mirhoseini, Azalia and Maziarz, Krzysztof and Davis, Andy and Le, Quoc and Hinton, Geoffrey and Dean, Jeff},
  title   = {Outrageously large neural networks: The sparsely-gated mixture-of-experts layer},
  journal = {arXiv preprint arXiv:1701.06538},
  year    = {2017},
}

@article{vaswani2017attention,
  title   = {Attention is all you need},
  author  = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {30},
  year    = {2017}
}

@article{dai2024deepseekmoe,
  title   = {{DeepSeekMoE}: Towards ultimate expert specialization in mixture-of-experts language models},
  author  = {Dai, Damai and Deng, Chengqi and Zhao, Chenggang and Xu, RX and Gao, Huazuo and Chen, Deli and Li, Jiashi and Zeng, Wangding and Yu, Xingkai and Wu, Yu and others},
  journal = {arXiv preprint arXiv:2401.06066},
  year    = {2024}
}

@article{jiang2024mixtral,
  title   = {Mixtral of experts},
  author  = {Jiang, Albert Q and Sablayrolles, Alexandre and Roux, Antoine and Mensch, Arthur and Savary, Blanche and Bamford, Chris and Chaplot, Devendra Singh and de las Casas, Diego and Hanna, Emma Bou and Bressand, Florian and others},
  journal = {arXiv preprint arXiv:2401.04088},
  year    = {2024}
}

@article{zoph2016neural,
  author  = {Zoph, Barret and Le, Quoc V},
  title   = {Neural architecture search with reinforcement learning},
  journal = {arXiv preprint arXiv:1611.01578},
  year    = {2016},
}

@article{han2021dynamic,
  title     = {Dynamic neural networks: A survey},
  author    = {Han, Yizeng and Huang, Gao and Song, Shiji and Yang, Le and Wang, Honghui and Wang, Yulin},
  journal   = {{IEEE} Transactions on Pattern Analysis and Machine Intelligence},
  volume    = {44},
  number    = {11},
  pages     = {7436--7456},
  year      = {2021},
  publisher = {IEEE}
}

# NOTE(review): the venue previously read "The ICLR Blog Track 2023" while year is 2022;
# the venue year was dropped and the post URL added -- confirm the URL resolves.
@article{shengyi2022the37implementation,
  title   = {The 37 implementation details of proximal policy optimization},
  author  = {Huang, Shengyi and Dossa, Rousslan Fernand Julien and Raffin, Antonin and Kanervisto, Anssi and Wang, Weixun},
  journal = {{ICLR} Blog Track},
  year    = {2022},
  url     = {https://iclr-blog-track.github.io/2022/03/25/ppo-implementation-details/}
}

@misc{li2025jointlyreinforcingdiversityquality,
  author        = {Tianjian Li and Yiming Zhang and Ping Yu and Swarnadeep Saha and Daniel Khashabi and Jason Weston and Jack Lanchantin and Tianlu Wang},
  title         = {Jointly Reinforcing Diversity and Quality in Language Model Generations},
  eprint        = {2509.02534},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  url           = {https://arxiv.org/abs/2509.02534},
  year          = {2025},
}

@article{li2025chain,
  author  = {Li, Weizhen and Lin, Jianbo and Jiang, Zhuosong and Cao, Jingyi and Liu, Xinpeng and Zhang, Jiayu and Huang, Zhenqiang and Chen, Qianben and Sun, Weichen and Wang, Qiexiang and others},
  title   = {Chain-of-Agents: End-to-End Agent Foundation Models via Multi-Agent Distillation and Agentic RL},
  journal = {arXiv preprint arXiv:2508.13167},
  year    = {2025},
}

@misc{zhang2025landscapeagenticreinforcementlearning,
  author        = {Guibin Zhang and Hejia Geng and Xiaohang Yu and Zhenfei Yin and Zaibin Zhang and Zelin Tan and Heng Zhou and Zhongzhi Li and Xiangyuan Xue and Yijiang Li and Yifan Zhou and Yang Chen and Chen Zhang and Yutao Fan and Zihu Wang and Songtao Huang and Yue Liao and Hongru Wang and Mengyue Yang and Heng Ji and Michael Littman and Jun Wang and Shuicheng Yan and Philip Torr and Lei Bai},
  title         = {The Landscape of Agentic Reinforcement Learning for LLMs: A Survey},
  eprint        = {2509.02547},
  archiveprefix = {arXiv},
  primaryclass  = {cs.AI},
  url           = {https://arxiv.org/abs/2509.02547},
  year          = {2025},
}

@article{sutton1999policy,
  author  = {Sutton, Richard S and McAllester, David and Singh, Satinder and Mansour, Yishay},
  title   = {Policy gradient methods for reinforcement learning with function approximation},
  year    = {1999},
  journal = {Advances in neural information processing systems},
  volume  = {12}
}

@misc{gan2025cotspacetheoreticalframeworkinternal,
  title         = {{CoT-Space}: A Theoretical Framework for Internal Slow-Thinking via Reinforcement Learning},
  author        = {Zeyu Gan and Hao Yi and Yong Liu},
  year          = {2025},
  eprint        = {2509.04027},
  archiveprefix = {arXiv},
  primaryclass  = {cs.AI},
  url           = {https://arxiv.org/abs/2509.04027}
}

@article{qu2025survey,
  author  = {Qu, Xiaoye and Li, Yafu and Su, Zhaochen and Sun, Weigao and Yan, Jianhao and Liu, Dongrui and Cui, Ganqu and Liu, Daizong and Liang, Shuxian and He, Junxian and others},
  title   = {A survey of efficient reasoning for large reasoning models: Language, multimodality, and beyond},
  year    = {2025},
  journal = {arXiv preprint arXiv:2503.21614}
}

@article{perolat2022mastering,
  title     = {Mastering the game of {Stratego} with model-free multiagent reinforcement learning},
  author    = {Perolat, Julien and De Vylder, Bart and Hennes, Daniel and Tarassov, Eugene and Strub, Florian and de Boer, Vincent and Muller, Paul and Connor, Jerome T and Burch, Neil and Anthony, Thomas and others},
  journal   = {Science},
  volume    = {378},
  number    = {6623},
  pages     = {990--996},
  year      = {2022},
  publisher = {American Association for the Advancement of Science}
}

@misc{yuan2025llms,
  title        = {From f(x) and g(x) to f(g(x)): {LLMs} Learn New Skills in {RL} by Composing Old Ones},
  author       = {Yuan, Lifan and Chen, Weize and Zhang, Yuchen and Cui, Ganqu and Wang, Hanbin and You, Ziming and Ding, Ning and Liu, Zhiyuan and Sun, Maosong and Peng, Hao},
  howpublished = {\url{https://husky-morocco-f72.notion.site/From-f-x-and-g-x-to-f-g-x-LLMs-Learn-New-Skills-in-RL-by-Composing-Old-Ones-2499aba4486f802c8108e76a12af3020}},
  note         = {Notion blog post, available online},
  year         = {2025}
}

@article{wang2025ui,
  title={{UI-TARS-2} Technical Report: Advancing {GUI} Agent with Multi-Turn Reinforcement Learning},
  author={Wang, Haoming and Zou, Haoyang and Song, Huatong and Feng, Jiazhan and Fang, Junjie and Lu, Junting and Liu, Longxiang and Luo, Qinyu and Liang, Shihao and Huang, Shijue and others},
  journal={arXiv preprint arXiv:2509.02544},
  year={2025}
}

@article{qin2025ui,
  title={{UI-TARS}: Pioneering automated {GUI} interaction with native agents},
  author={Qin, Yujia and Ye, Yining and Fang, Junjie and Wang, Haoming and Liang, Shihao and Tian, Shizuo and Zhang, Junda and Li, Jiahao and Li, Yunxin and Huang, Shijue and others},
  journal={arXiv preprint arXiv:2501.12326},
  year={2025}
}

@article{chen2021evaluating,
  author  = {Chen, Mark and Tworek, Jerry and Jun, Heewoo and Yuan, Qiming and Pinto, Henrique Ponde De Oliveira and Kaplan, Jared and Edwards, Harri and Burda, Yuri and Joseph, Nicholas and Brockman, Greg and others},
  title   = {Evaluating large language models trained on code},
  journal = {arXiv preprint arXiv:2107.03374},
  year    = {2021}
}

@article{austin2021program,
  author  = {Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others},
  title   = {Program synthesis with large language models},
  journal = {arXiv preprint arXiv:2108.07732},
  year    = {2021}
}

@article{hendrycks2021measuring,
  title={Measuring coding challenge competence with {APPS}},
  author={Hendrycks, Dan and Basart, Steven and Kadavath, Saurav and Mazeika, Mantas and Arora, Akul and Guo, Ethan and Burns, Collin and Puranik, Samir and He, Horace and Song, Dawn and others},
  journal={arXiv preprint arXiv:2105.09938},
  year={2021}
}

@article{chen2025bridging,
  author  = {Chen, Huayu and Zheng, Kaiwen and Zhang, Qinsheng and Cui, Ganqu and Cui, Yin and Ye, Haotian and Lin, Tsung-Yi and Liu, Ming-Yu and Zhu, Jun and Wang, Haoxiang},
  title   = {Bridging supervised learning and reinforcement learning in math reasoning},
  journal = {arXiv preprint arXiv:2505.18116},
  year    = {2025}
}

@article{ionides2008truncated,
  author    = {Ionides, Edward L},
  title     = {Truncated importance sampling},
  journal   = {Journal of Computational and Graphical Statistics},
  volume    = {17},
  number    = {2},
  pages     = {295--311},
  publisher = {Taylor \& Francis},
  year      = {2008}
}

@article{shenfeld2025rl,
  title={{RL}'s Razor: Why Online Reinforcement Learning Forgets Less},
  author={Shenfeld, Idan and Pari, Jyothish and Agrawal, Pulkit},
  journal={arXiv preprint arXiv:2509.04259},
  year={2025}
}

@article{zhao2025geometric,
  author  = {Zhao, Yuzhong and Liu, Yue and Liu, Junpeng and Chen, Jingye and Wu, Xun and Hao, Yaru and Lv, Tengchao and Huang, Shaohan and Cui, Lei and Ye, Qixiang and others},
  title   = {Geometric-Mean Policy Optimization},
  journal = {arXiv preprint arXiv:2507.20673},
  year    = {2025}
}

@article{zhang2025policy,
  title={On-policy {RL} meets off-policy experts: Harmonizing supervised fine-tuning and reinforcement learning via dynamic weighting},
  author={Zhang, Wenhao and Xie, Yuexiang and Sun, Yuchang and Chen, Yanxi and Wang, Guoyin and Li, Yaliang and Ding, Bolin and Zhou, Jingren},
  journal={arXiv preprint arXiv:2508.11408},
  year={2025}
}

@article{lv2025towards,
  author  = {Lv, Xingtai and Zuo, Yuxin and Sun, Youbang and Liu, Hongyi and Wei, Yuntian and Chen, Zhekai and He, Lixuan and Zhu, Xuekai and Zhang, Kaiyan and Wang, Bingning and others},
  title   = {Towards a Unified View of Large Language Model Post-Training},
  journal = {arXiv preprint arXiv:2509.04419},
  year    = {2025}
}

@article{zhang2025stephint,
  title={{StepHint}: Multi-level Stepwise Hints Enhance Reinforcement Learning to Reason},
  author={Zhang, Kaiyi and Lv, Ang and Li, Jinpeng and Wang, Yongbo and Wang, Feng and Hu, Haoyuan and Yan, Rui},
  journal={arXiv preprint arXiv:2507.02841},
  year={2025}
}

@article{rajani2025scalpel,
  title={Scalpel vs. Hammer: {GRPO} Amplifies Existing Capabilities, {SFT} Replaces Them},
  author={Rajani, Neel and Gema, Aryo Pradipta and Goldfarb-Tarrant, Seraphina and Titov, Ivan},
  journal={arXiv preprint arXiv:2507.10616},
  year={2025}
}

@article{xiao2025connection,
  title={On a connection between imitation learning and {RLHF}},
  author={Xiao, Teng and Yuan, Yige and Li, Mingxiao and Chen, Zhengyu and Honavar, Vasant G},
  journal={arXiv preprint arXiv:2503.05079},
  year={2025}
}

@article{sun2024supervised,
  author  = {Sun, Hao},
  title   = {Supervised fine-tuning as inverse reinforcement learning},
  journal = {arXiv preprint arXiv:2403.12017},
  year    = {2024}
}

@article{ji2024towards,
  author  = {Ji, Haozhe and Lu, Cheng and Niu, Yilin and Ke, Pei and Wang, Hongning and Zhu, Jun and Tang, Jie and Huang, Minlie},
  title   = {Towards efficient exact optimization of language model alignment},
  journal = {arXiv preprint arXiv:2402.00856},
  year    = {2024}
}

@misc{chen2025twostagetrainingcooperativesft,
      title={Beyond Two-Stage Training: Cooperative {SFT} and {RL} for {LLM} Reasoning},
      author={Liang Chen and Xueting Han and Li Shen and Jing Bai and Kam-Fai Wong},
      year={2025},
      eprint={2509.06948},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2509.06948}
}

@misc{ernie2025technicalreport,
      title={{ERNIE} 4.5 Technical Report},
      author={Baidu},
      year={2025},
      howpublished={\url{https://ernie.baidu.com/blog/publication/ERNIE_Technical_Report.pdf}}
}

@article{yu2022surprising,
  title={The surprising effectiveness of {PPO} in cooperative multi-agent games},
  author={Yu, Chao and Velu, Akash and Vinitsky, Eugene and Gao, Jiaxuan and Wang, Yu and Bayen, Alexandre and Wu, Yi},
  journal={Advances in neural information processing systems},
  volume={35},
  pages={24611--24624},
  year={2022}
}

@article{sunehag2017value,
  author  = {Sunehag, Peter and Lever, Guy and Gruslys, Audrunas and Czarnecki, Wojciech Marian and Zambaldi, Vinicius and Jaderberg, Max and Lanctot, Marc and Sonnerat, Nicolas and Leibo, Joel Z and Tuyls, Karl and others},
  title   = {Value-decomposition networks for cooperative multi-agent learning},
  journal = {arXiv preprint arXiv:1706.05296},
  year    = {2017}
}

@inproceedings{yang2018mean,
  title={Mean field multi-agent reinforcement learning},
  author={Yang, Yaodong and Luo, Rui and Li, Minne and Zhou, Ming and Zhang, Weinan and Wang, Jun},
  booktitle={International conference on machine learning},
  pages={5571--5580},
  year={2018},
  publisher={PMLR}
}

@article{intelligence2025pi_,
  title={{$\pi_{0.5}$}: a Vision-Language-Action Model with Open-World Generalization},
  author={{Physical Intelligence} and Black, Kevin and Brown, Noah and Darpinian, James and Dhabalia, Karan and Driess, Danny and Esmail, Adnan and Equi, Michael and Finn, Chelsea and Fusai, Niccolo and others},
  journal={arXiv preprint arXiv:2504.16054},
  year={2025}
}

@article{peng2018deepmimic,
  title={{DeepMimic}: Example-guided deep reinforcement learning of physics-based character skills},
  author={Peng, Xue Bin and Abbeel, Pieter and Levine, Sergey and van de Panne, Michiel},
  journal={{ACM} Transactions on Graphics (TOG)},
  volume={37},
  number={4},
  pages={1--14},
  year={2018},
  publisher={ACM},
  address={New York, NY, USA}
}

@article{hwangbo2019learning,
  author    = {Hwangbo, Jemin and Lee, Joonho and Dosovitskiy, Alexey and Bellicoso, Dario and Tsounis, Vassilios and Koltun, Vladlen and Hutter, Marco},
  title     = {Learning agile and dynamic motor skills for legged robots},
  journal   = {Science Robotics},
  volume    = {4},
  number    = {26},
  pages     = {eaau5872},
  publisher = {American Association for the Advancement of Science},
  year      = {2019}
}

@article{chen2023bi,
  title={{Bi-DexHands}: Towards human-level bimanual dexterous manipulation},
  author={Chen, Yuanpei and Geng, Yiran and Zhong, Fangwei and Ji, Jiaming and Jiang, Jiechuang and Lu, Zongqing and Dong, Hao and Yang, Yaodong},
  journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
  volume={46},
  number={5},
  pages={2804--2818},
  year={2023},
  publisher={IEEE}
}

@article{liu2025webexplorerexploreevolvetraining,
  title={{WebExplorer}: Explore and evolve for training long-horizon web agents},
  author={Liu, Junteng and Li, Yunji and Zhang, Chi and Li, Jingyang and Chen, Aili and Ji, Ke and Cheng, Weiyu and Wu, Zijia and Du, Chengyu and Xu, Qidi and others},
  journal={arXiv preprint arXiv:2509.06501},
  year={2025}
}

@article{nguyen2025sfrdeepresearcheffectivereinforcementlearning,
  title={{SFR-DeepResearch}: Towards effective reinforcement learning for autonomously reasoning single agents},
  author={Nguyen, Xuan-Phi and Pandit, Shrey and Reddy, Revanth Gangi and Xu, Austin and Savarese, Silvio and Xiong, Caiming and Joty, Shafiq},
  journal={arXiv preprint arXiv:2509.06283},
  year={2025}
}

@article{yang2025reinforcement,
  author  = {Yang, Sherry and He-Yueya, Joy and Liang, Percy},
  title   = {Reinforcement Learning for Machine Learning Engineering Agents},
  journal = {arXiv preprint arXiv:2509.01684},
  year    = {2025}
}

@article{schmied2025llms,
  title={{LLMs} are greedy agents: Effects of {RL} fine-tuning on decision-making abilities},
  author={Schmied, Thomas and Bornschein, J{\"o}rg and Grau-Moya, Jordi and Wulfmeier, Markus and Pascanu, Razvan},
  journal={arXiv preprint arXiv:2504.16078},
  year={2025}
}

@article{he2025nondeterminism,
  author = {He, Horace and {Thinking Machines Lab}},
  title = {Defeating Nondeterminism in {LLM} Inference},
  journal = {Thinking Machines Lab: Connectionism},
  year = {2025},
  note = {https://thinkingmachines.ai/blog/defeating-nondeterminism-in-llm-inference/},
  doi = {10.64434/tml.20250910}
}

@misc{lu2025learning,
  author        = {Lu, Yining and Wang, Zilong and Li, Shiyang and Liu, Xin and Yu, Changlong and Yin, Qingyu and Shi, Zhan and Zhang, Zixuan and Jiang, Meng},
  title         = {Learning to Optimize Multi-Objective Alignment Through Dynamic Reward Weighting},
  eprint        = {2509.11452},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  year          = {2025}
}

@article{li2025gradient,
  author  = {Li, Chengao and Zhang, Hanyu and Xu, Yunkun and Xue, Hongyan and Ao, Xiang and He, Qing},
  title   = {Gradient-Adaptive Policy Optimization: Towards Multi-Objective Alignment of Large Language Models},
  journal = {arXiv preprint arXiv:2507.01915},
  year    = {2025}
}

@article{liu2024stochastic,
  author    = {Liu, Suyun and Vicente, Luis Nunes},
  title     = {The stochastic multi-gradient algorithm for multi-objective optimization and its application to supervised machine learning},
  journal   = {Annals of Operations Research},
  volume    = {339},
  number    = {3},
  pages     = {1119--1148},
  publisher = {Springer},
  year      = {2024}
}

@article{zhang2025grpo,
  title={{GRPO-LEAD}: A difficulty-aware reinforcement learning approach for concise mathematical reasoning in language models},
  author={Zhang, Jixiao and Zuo, Chunsheng},
  journal={arXiv preprint arXiv:2504.09696},
  year={2025}
}

@article{zheng2025parallel,
  title={{Parallel-R1}: Towards Parallel Thinking via Reinforcement Learning},
  author={Zheng, Tong and Zhang, Hongming and Yu, Wenhao and Wang, Xiaoyang and Yang, Xinyu and Dai, Runpeng and Liu, Rui and Bao, Huiwen and Huang, Chengsong and Huang, Heng and others},
  journal={arXiv preprint arXiv:2509.07980},
  year={2025}
}

@article{feng2025group,
  title={Group-in-group policy optimization for {LLM} agent training},
  author={Feng, Lang and Xue, Zhenghai and Liu, Tingcong and An, Bo},
  journal={arXiv preprint arXiv:2505.10978},
  year={2025}
}

@article{team2025longcat,
  title={{LongCat-Flash} Technical Report},
  author={{Meituan LongCat Team} and Li, Bei and Lei, Bingye and Wang, Bo and Rong, Bolin and Wang, Chao and Zhang, Chao and Gao, Chen and Zhang, Chen and Sun, Cheng and others},
  journal={arXiv preprint arXiv:2509.01322},
  year={2025}
}

@article{jha2025proof2silicon,
  title={{Proof2Silicon}: Prompt Repair for Verified Code and Hardware Generation via Reinforcement Learning},
  author={Jha, Manvi and Wan, Jiaxin and Chen, Deming},
  journal={arXiv preprint arXiv:2509.06239},
  year={2025}
}

@article{lee2025learning,
  author  = {Lee, Dongjun and Hwang, Changho and Lee, Kimin},
  title   = {Learning to Generate Unit Test via Adversarial Reinforcement Learning},
  journal = {arXiv preprint arXiv:2508.21107},
  year    = {2025}
}

@article{yeh2025ar,
  title={{AR2}: Adversarial Reinforcement Learning for Abstract Reasoning in Large Language Models},
  author={Yeh, Cheng-Kai and Lee, Hsing-Wang and Kuo, Chung-Hung and Huang, Hen-Hsen},
  journal={arXiv preprint arXiv:2509.03537},
  year={2025}
}

@article{sinha2025illusion,
  title={The Illusion of Diminishing Returns: Measuring Long Horizon Execution in {LLMs}},
  author={Sinha, Akshit and Arun, Arvindh and Goel, Shashwat and Staab, Steffen and Geiping, Jonas},
  journal={arXiv preprint arXiv:2509.09677},
  year={2025}
}

@article{paprunia2025advancing,
  title={Advancing {SLM} Tool-Use Capability using Reinforcement Learning},
  author={Paprunia, Dhruvi and Kharidia, Vansh and Doshi, Pankti},
  journal={arXiv preprint arXiv:2509.04518},
  year={2025}
}

@article{xue2025simpletir,
  title={{SimpleTIR}: End-to-end reinforcement learning for multi-turn tool-integrated reasoning},
  author={Xue, Zhenghai and Zheng, Longtao and Liu, Qian and Li, Yingru and Zheng, Xiaosen and Ma, Zejun and An, Bo},
  journal={arXiv preprint arXiv:2509.02479},
  year={2025}
}

@article{li2025encouraging,
  title={Encouraging Good Processes Without the Need for Good Answers: Reinforcement Learning for {LLM} Agent Planning},
  author={Li, Zhiwei and Hu, Yong and Wang, Wenqing},
  journal={arXiv preprint arXiv:2508.19598},
  year={2025}
}

@article{zhang2025group,
  title={Group Expectation Policy Optimization for Stable Heterogeneous Reinforcement Learning in {LLMs}},
  author={Zhang, Han and Zheng, Ruibin and Yi, Zexuan and Peng, Hanyang and Wang, Hui and Yu, Yue},
  journal={arXiv preprint arXiv:2508.17850},
  year={2025}
}

@misc{xu2025singlestream,
  author        = {Xu, Zhongwen and Ding, Zihan},
  title         = {Single-stream Policy Optimization},
  eprint        = {2509.13232},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  year          = {2025}
}

@misc{jayalath2025compute,
  author        = {Jayalath, Dulhan and Goel, Shashwat and Foster, Thomas and Jain, Parag and Gururangan, Suchin and Zhang, Cheng and Goyal, Anirudh and Schelten, Alan},
  title         = {Compute as Teacher: Turning Inference Compute Into Reference-Free Supervision},
  eprint        = {2509.14234},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  year          = {2025}
}

@misc{su2025klearreasoner,
    title={{Klear-Reasoner}: Advancing Reasoning Capability via Gradient-Preserving Clipping Policy Optimization},
    author={Zhenpeng Su and Leiyu Pan and Xue Bai and Dening Liu and Guanting Dong and Jiaming Huang and Wenping Hu and Fuzheng Zhang and Kun Gai and Guorui Zhou},
    year={2025},
    eprint={2508.07629},
    archivePrefix={arXiv},
    primaryClass={cs.LG}
}

@article{zhu2025flowrl,
  title={{FlowRL}: Matching Reward Distributions for {LLM} Reasoning},
  author={Zhu, Xuekai and Cheng, Daixuan and Zhang, Dinghuai and Li, Hengli and Zhang, Kaiyan and Jiang, Che and Sun, Youbang and Hua, Ermo and Zuo, Yuxin and Lv, Xingtai and others},
  journal={arXiv preprint arXiv:2509.15207},
  year={2025}
}

@article{wu2025mirage,
  title={Mirage or Method? How Model-Task Alignment Induces Divergent {RL} Conclusions},
  author={Wu, Haoze and Wang, Cheng and Zhao, Wenshuo and He, Junxian},
  journal={arXiv e-prints},
  pages={arXiv--2508},
  year={2025}
}

@article{dwyer2025s,
  title={It's Not You, It's Clipping: A Soft Trust-Region via Probability Smoothing for {LLM} {RL}},
  author={Dwyer, Madeleine and Sobey, Adam and Chapman, Adriane},
  journal={arXiv preprint arXiv:2509.21282},
  year={2025}
}

@article{ji2025tree,
  title={Tree Search for {LLM} Agent Reinforcement Learning},
  author={Ji, Yuxiang and Ma, Ziyu and Wang, Yong and Chen, Guanhua and Chu, Xiangxiang and Wu, Liaoni},
  journal={arXiv preprint arXiv:2509.21240},
  year={2025}
}

@article{matsutani2025rl,
  title={{RL} Squeezes, {SFT} Expands: A Comparative Study of Reasoning {LLMs}},
  author={Matsutani, Kohsei and Takashiro, Shota and Minegishi, Gouki and Kojima, Takeshi and Iwasawa, Yusuke and Matsuo, Yutaka},
  journal={arXiv preprint arXiv:2509.21128},
  year={2025}
}

@article{li2025reinforcement,
  author  = {Li, Siheng and Li, Kejiao and Xu, Zenan and Huang, Guanhua and Yang, Evander and Li, Kun and Wu, Haoyuan and Wu, Jiajia and Zheng, Zihao and Zhang, Chenchen and others},
  title   = {Reinforcement Learning on Pre-Training Data},
  journal = {arXiv preprint arXiv:2509.19249},
  year    = {2025}
}

@article{zhou2025april,
  title={{APRIL}: Active Partial Rollouts in Reinforcement Learning to tame long-tail generation},
  author={Zhou, Yuzhen and Li, Jiajun and Su, Yusheng and Ramesh, Gowtham and Zhu, Zilin and Long, Xiang and Zhao, Chenyang and Pan, Jin and Yu, Xiaodong and Wang, Ze and others},
  journal={arXiv preprint arXiv:2509.18521},
  year={2025}
}

@article{lacombe2025reasoning,
  title={Reasoning Core: A Scalable {RL} Environment for {LLM} Symbolic Reasoning},
  author={Lacombe, Valentin and Quesnel, Valentin and Sileo, Damien},
  journal={arXiv preprint arXiv:2509.18083},
  year={2025}
}

@article{ling2025table2latex,
  title={{Table2LaTeX-RL}: High-Fidelity {LaTeX} Code Generation from Table Images via Reinforced Multimodal Language Models},
  author={Ling, Jun and Qi, Yao and Huang, Tao and Zhou, Shibo and Huang, Yanqin and Yang, Jiang and Song, Ziqi and Zhou, Ying and Yang, Yang and Shen, Heng Tao and others},
  journal={arXiv preprint arXiv:2509.17589},
  year={2025}
}

@article{du2025generalizable,
  title={Generalizable End-to-End Tool-Use {RL} with Synthetic {CodeGym}},
  author={Du, Weihua and Gong, Hailei and Ling, Zhan and Liu, Kang and Shen, Lingfeng and Yao, Xuesong and Xu, Yufei and Shi, Dingyuan and Yang, Yiming and Chen, Jiecao},
  journal={arXiv preprint arXiv:2509.17325},
  year={2025}
}

@article{tan2025process,
  author  = {Tan, Weiting and Qu, Xinghua and Tu, Ming and Ge, Meng and Liu, Andy T and Koehn, Philipp and Lu, Lu},
  title   = {Process-Supervised Reinforcement Learning for Interactive Multimodal Tool-Use Agents},
  journal = {arXiv preprint arXiv:2509.14480},
  year    = {2025}
}

@article{bhaskar2025language,
  author  = {Bhaskar, Adithya and Ye, Xi and Chen, Danqi},
  title   = {Language Models that Think, Chat Better},
  journal = {arXiv preprint arXiv:2509.20357},
  year    = {2025}
}

@article{zhou2025evolving,
  author  = {Zhou, Yujun and Liang, Zhenwen and Liu, Haolin and Yu, Wenhao and Panaganti, Kishan and Song, Linfeng and Yu, Dian and Zhang, Xiangliang and Mi, Haitao and Yu, Dong},
  title   = {Evolving Language Models without Labels: Majority Drives Selection, Novelty Promotes Variation},
  journal = {arXiv preprint arXiv:2509.15194},
  year    = {2025}
}

@article{xing2025caprl,
  title={{CapRL}: Stimulating Dense Image Caption Capabilities via Reinforcement Learning},
  author={Xing, Long and Dong, Xiaoyi and Zang, Yuhang and Cao, Yuhang and Liang, Jianze and Huang, Qidong and Wang, Jiaqi and Wu, Feng and Lin, Dahua},
  journal={arXiv preprint arXiv:2509.22647},
  year={2025}
}

@article{zhang2025tdrm,
  title={{TDRM}: Smooth Reward Models with Temporal Difference for {LLM} {RL} and Inference},
  author={Zhang, Dan and Cai, Min and Li, Jonathan and Hu, Ziniu and Yue, Yisong and Dong, Yuxiao and Tang, Jie},
  journal={arXiv preprint arXiv:2509.15110},
  year={2025}
}

@article{liu2025fleming,
  title={{Fleming-R1}: Toward Expert-Level Medical Reasoning via Reinforcement Learning},
  author={Liu, Chi and Li, Derek and Shu, Yan and Chen, Robin and Duan, Derek and Fang, Teng and Dai, Bryan},
  journal={arXiv preprint arXiv:2509.15279},
  year={2025}
}

@article{yu2025minicpm,
  title={{MiniCPM-V} 4.5: Cooking Efficient {MLLMs} via Architecture, Data, and Training Recipe},
  author={Yu, Tianyu and Wang, Zefan and Wang, Chongyi and Huang, Fuwei and Ma, Wenshuo and He, Zhihui and Cai, Tianchi and Chen, Weize and Huang, Yuxiang and Zhao, Yuanqian and others},
  journal={arXiv preprint arXiv:2509.18154},
  year={2025}
}

@article{zhang2025tool,
  title={{Tool-R1}: Sample-Efficient Reinforcement Learning for Agentic Tool Use},
  author={Zhang, Yabo and Zeng, Yihan and Li, Qingyun and Hu, Zhen and Han, Kavin and Zuo, Wangmeng},
  journal={arXiv preprint arXiv:2509.12867},
  year={2025}
}

@article{ghasemipour2025self,
  author  = {Ghasemipour, Seyed Kamyar Seyed and Wahid, Ayzaan and Tompson, Jonathan and Sanketi, Pannag and Mordatch, Igor},
  title   = {Self-Improving Embodied Foundation Models},
  journal = {arXiv preprint arXiv:2509.15155},
  year    = {2025}
}

@article{zhao2025learninggui,
  title={Learning {GUI} Grounding with Spatial Reasoning from Visual Feedback},
  author={Zhao, Yu and Chen, Wei-Ning and Inan, Huseyin Atahan and Kessler, Samuel and Wang, Lu and Wutschitz, Lukas and Yang, Fangkai and Zhang, Chaoyun and Minervini, Pasquale and Rajmohan, Saravan and others},
  journal={arXiv preprint arXiv:2509.21552},
  year={2025}
}

@article{chen2025perception,
  author  = {Chen, Yan and Li, Long and Xi, Teng and Zeng, Long and Wang, Jingdong},
  title   = {Perception Before Reasoning: Two-Stage Reinforcement Learning for Visual Reasoning in Vision-Language Models},
  journal = {arXiv preprint arXiv:2509.13031},
  year    = {2025}
}

@article{liu2025visual,
  title={{Visual-RFT}: Visual reinforcement fine-tuning},
  author={Liu, Ziyu and Sun, Zeyi and Zang, Yuhang and Dong, Xiaoyi and Cao, Yuhang and Duan, Haodong and Lin, Dahua and Wang, Jiaqi},
  journal={arXiv preprint arXiv:2503.01785},
  year={2025}
}
